Luna Pathology

analysis

Created on April 27, 2021

@author: pashaa@mskcc.org

ml

BaseTorchClassifier

Bases: Module

Source code in src/luna/pathology/analysis/ml.py
class BaseTorchClassifier(nn.Module):
    def __init__(self, **kwargs):
        """Initialize BaseTorchClassifier

        A generic base class for a PyTorch classifier model. This serves as the base class inherited
        for model training and inference.

        Will run on cuda if available, on the device specified by the CUDA_VISIBLE_DEVICES environment variable

        Args:
            kwargs: Keyword arguments passed on to the subclass method
        """

        super(BaseTorchClassifier, self).__init__()

        self.cuda_is_available = torch.cuda.is_available()

        self.setup(**kwargs)

        if self.cuda_is_available:
            self.cuda()

    def setup(self, **kwargs):
        """Set classifier modules

        Template/abstract method where individual modules that make up the forward pass are configured

        Args:
            kwargs: Keyword arguments passed on to the subclass method
        """
        raise NotImplementedError("setup() has not been implemented in the subclass!")
__init__(**kwargs)

Initialize BaseTorchClassifier

A generic base class for a PyTorch classifier model. This serves as the base class inherited for model training and inference.

Will run on cuda if available, on the device specified by the CUDA_VISIBLE_DEVICES environment variable

Parameters:

    kwargs: Keyword arguments passed on to the subclass method. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def __init__(self, **kwargs):
    """Initialize BaseTorchClassifier

    A generic base class for a PyTorch classifier model. This serves as the base class inherited
    for model training and inference.

    Will run on cuda if available, on the device specified by the CUDA_VISIBLE_DEVICES environment variable

    Args:
        kwargs: Keyword arguments passed on to the subclass method
    """

    super(BaseTorchClassifier, self).__init__()

    self.cuda_is_available = torch.cuda.is_available()

    self.setup(**kwargs)

    if self.cuda_is_available:
        self.cuda()
setup(**kwargs)

Set classifier modules

Template/abstract method where individual modules that make up the forward pass are configured

Parameters:

    kwargs: Keyword arguments passed on to the subclass method. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def setup(self, **kwargs):
    """Set classifier modules

    Template/abstract method where individual modules that make up the forward pass are configured

    Args:
        kwargs: Keyword arguments passed on to the subclass method
    """
    raise NotImplementedError("setup() has not been implemented in the subclass!")
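
Example: a minimal sketch of a concrete subclass; the layer sizes and class count are illustrative assumptions, not part of the luna API.

import torch.nn as nn

from luna.pathology.analysis.ml import BaseTorchClassifier


class SimpleClassifier(BaseTorchClassifier):
    def setup(self, n_features: int = 64, n_classes: int = 2, **kwargs):
        # Modules registered here are in place before __init__ moves the
        # model to CUDA (when available)
        self.model = nn.Sequential(
            nn.Linear(n_features, 32),
            nn.ReLU(),
            nn.Linear(32, n_classes),
        )


classifier = SimpleClassifier(n_features=64, n_classes=2)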

BaseTorchTileClassifier

Bases: BaseTorchClassifier

Source code in src/luna/pathology/analysis/ml.py
class BaseTorchTileClassifier(BaseTorchClassifier):
    def forward(self, index, tile_data):
        """Forward pass for base classifier class

        Runs inference on a batch of tile data, returning predictions indexed by tile address

        Args:
            index (list[str]): Tile address indices with length B
            tile_data (torch.tensor): Input tiles of shape (B, *)

        Returns:
            pd.DataFrame: Dataframe of output features
        """
        if self.cuda_is_available:
            tile_data = tile_data.cuda()
        self.eval()
        with torch.no_grad():
            return pd.DataFrame(
                self.predict(tile_data).cpu().numpy(),
                index=index,
            )

    def setup(self, **kwargs):
        """Set classifier modules

        Template/abstract method where individual modules that make up the forward pass are configured

        Args:
            kwargs: Keyword arguments passed on to the subclass method
        """
        raise NotImplementedError("setup() has not been implemented in the subclass!")

    def predict(self, input_tiles: torch.tensor):
        """predict method

        Must be implemented in the subclass to pass the input tensor through the modules specified in setup()

        Args:
            input_tiles (torch.tensor): Input tiles of shape (B, *)

        Returns:
            torch.tensor: 2D tensor with (B, C) where B is the batch dimension and C are output classes or features
        """
        raise NotImplementedError("predict() has not been implemented in the subclass!")
forward(index, tile_data)

Forward pass for base classifier class

Runs inference on a batch of tile data, returning predictions indexed by tile address

Parameters:

    index (list[str]): Tile address indices with length B. Required.
    tile_data (torch.tensor): Input tiles of shape (B, *). Required.

Returns:

    pd.DataFrame: Dataframe of output features

Source code in src/luna/pathology/analysis/ml.py
def forward(self, index, tile_data):
    """Forward pass for base classifier class

    Runs inference on a batch of tile data, returning predictions indexed by tile address

    Args:
        index (list[str]): Tile address indices with length B
        tile_data (torch.tensor): Input tiles of shape (B, *)

    Returns:
        pd.DataFrame: Dataframe of output features
    """
    if self.cuda_is_available:
        tile_data = tile_data.cuda()
    self.eval()
    with torch.no_grad():
        return pd.DataFrame(
            self.predict(tile_data).cpu().numpy(),
            index=index,
        )
predict(input_tiles)

predict method

Must be implemented in the subclass to pass the input tensor through the modules specified in setup()

Parameters:

    input_tiles (torch.tensor): Input tiles of shape (B, *). Required.

Returns:

    torch.tensor: 2D tensor with (B, C) where B is the batch dimension and C are output classes or features

Source code in src/luna/pathology/analysis/ml.py
def predict(self, input_tiles: torch.tensor):
    """predict method

    Must be implemented in the subclass to pass the input tensor through the modules specified in setup()

    Args:
        input_tiles (torch.tensor): Input tiles of shape (B, *)

    Returns:
        torch.tensor: 2D tensor with (B, C) where B is the batch dimension and C are output classes or features
    """
    raise NotImplementedError("predict() has not been implemented in the subclass!")
setup(**kwargs)

Set classifier modules

Template/abstract method where individual modules that make up the forward pass are configured

Parameters:

    kwargs: Keyword arguments passed on to the subclass method. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def setup(self, **kwargs):
    """Set classifier modules

    Template/abstract method where individual modules that make up the forward pass are configured

    Args:
        kwargs: Keyword arguments passed on to the subclass method
    """
    raise NotImplementedError("setup() has not been implemented in the subclass!")
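
Example: a hedged sketch of a concrete tile classifier; the ResNet backbone and class count are illustrative assumptions, not luna defaults.

import torch
import torch.nn as nn
from torchvision.models import resnet18

from luna.pathology.analysis.ml import BaseTorchTileClassifier


class TileScorer(BaseTorchTileClassifier):
    def setup(self, n_classes: int = 2, **kwargs):
        self.backbone = resnet18(weights=None)
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, n_classes)

    def predict(self, input_tiles: torch.Tensor):
        # (B, 3, H, W) -> (B, n_classes), as expected by forward()
        return torch.softmax(self.backbone(input_tiles), dim=1)


model = TileScorer(n_classes=2)
# forward() returns a pd.DataFrame of class scores indexed by tile address
df_scores = model(["x1_y1_z1", "x2_y2_z1"], torch.randn(2, 3, 224, 224))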

BaseTorchTileDataset

Bases: Dataset

Base class for a tile dataset

Implements the usual torch dataset methods, and additionally provides a decoding of the binary tile data. PIL images can be further preprocessed before becoming torch tensors via an abstract preprocess method

Will send the tensors to GPU if available, on the device specified by the CUDA_VISIBLE_DEVICES environment variable

Source code in src/luna/pathology/analysis/ml.py
class BaseTorchTileDataset(Dataset):
    """Base class for a tile dataset

    Implements the usual torch dataset methods, and additionally provides a decoding of the binary tile data.
    PIL images can be further preprocessed before becoming torch tensors via an abstract preprocess method

    Will send the tensors to GPU if available, on the device specified by the CUDA_VISIBLE_DEVICES environment variable
    """

    def __init__(
        self,
        tile_manifest: pd.DataFrame = None,
        tile_urlpath: str = "",
        label_cols: List[str] = [],
        using_ray: bool = False,
        storage_options: dict = {},
        **kwargs,
    ):
        """Initialize BaseTileDataset

        Can accept either a tile dataframe or a path to tile data

        Args:
            tile_manifest (pd.DataFrame): Dataframe of tile data
            tile_urlpath (str): Base URL/path of tile data
            label_cols (list[str]): (Optional) label columns to return as tensors, e.g. for training
            using_ray (bool): (Optional) Perform distributed dataloading with Ray for training
        """

        if tile_manifest is not None:
            self.tile_manifest = tile_manifest
        elif tile_urlpath:
            with open(tile_urlpath, **storage_options) as of:
                self.tile_manifest = pd.read_parquet(of).set_index("address")
        else:
            raise RuntimeError("Must specifiy either tile_manifest or tile_path")

        self.label_cols = label_cols
        self.using_ray = using_ray

        self.setup(**kwargs)

    def __len__(self):
        return len(self.tile_manifest)

    def __repr__(self):
        return f"TileDataset with {len(self.tile_manifest)} tiles, indexed by {self.tile_manifest.index.names}, returning label columns: {self.label_cols}"

    def __getitem__(self, idx: int):
        """Tile accessor

        Loads a tile image from the tile manifest. Always returns the index of the input dataframe row and the tile data.
        If label columns were specified, the 3rd position of the tuple is a tensor of the label data. If Ray is being used for
        model training, then only the image data and the label are returned.

        Args:
            idx (int): Integer index

        Returns:
            (optional str, torch.tensor, optional torch.tensor): tuple of the tile index and corresponding tile as a torch tensor, and metadata labels if specified
        """

        row = self.tile_manifest.iloc[idx]
        img = Image.fromarray(get_tile_array(row))

        if self.using_ray:
            if not (len(self.label_cols)):
                raise ValueError(
                    "If using Ray for training, you must provide a label column"
                )
            return self.preprocess(img), torch.tensor(row[self.label_cols]).squeeze()

        if len(self.label_cols):
            return (
                row.name,
                self.preprocess(img),
                torch.tensor(row[self.label_cols].to_list()),
            )
        else:
            return row.name, self.preprocess(img)

    def setup(self, **kwargs):
        """Set additional attributes for dataset class

        Template/abstract method where a dataset is configured

        Args:
            kwargs: Keyword arguments passed on to the subclass method
        """
        raise NotImplementedError("setup() has not been implemented in the subclass!")

    def preprocess(self, input_tile: Image):
        """Preprocessing method called for each tile patch

        Must be implemented in the subclass to accept a single PIL image and return a torch tensor.

        Args:
            input_tile (Image): Input tile as a PIL image

        Returns:
            torch.tensor: Output tile as preprocessed tensor
        """
        raise NotImplementedError(
            "preprocess() has not been implimented in the subclass!"
        )
__getitem__(idx)

Tile accessor

Loads a tile image from the tile manifest. Always returns the index of the input dataframe row and the tile data. If label columns were specified, the 3rd position of the tuple is a tensor of the label data. If Ray is being used for model training, then only the image data and the label are returned.

Parameters:

    idx (int): Integer index. Required.

Returns:

    (optional str, torch.tensor, optional torch.tensor): tuple of the tile index and corresponding tile as a torch tensor, and metadata labels if specified

Source code in src/luna/pathology/analysis/ml.py
def __getitem__(self, idx: int):
    """Tile accessor

    Loads a tile image from the tile manifest. Always returns the index of the input dataframe row and the tile data.
    If label columns were specified, the 3rd position of the tuple is a tensor of the label data. If Ray is being used for
    model training, then only the image data and the label are returned.

    Args:
        idx (int): Integer index

    Returns:
        (optional str, torch.tensor, optional torch.tensor): tuple of the tile index and corresponding tile as a torch tensor, and metadata labels if specified
    """

    row = self.tile_manifest.iloc[idx]
    img = Image.fromarray(get_tile_array(row))

    if self.using_ray:
        if not (len(self.label_cols)):
            raise ValueError(
                "If using Ray for training, you must provide a label column"
            )
        return self.preprocess(img), torch.tensor(row[self.label_cols]).squeeze()

    if len(self.label_cols):
        return (
            row.name,
            self.preprocess(img),
            torch.tensor(row[self.label_cols].to_list()),
        )
    else:
        return row.name, self.preprocess(img)
__init__(tile_manifest=None, tile_urlpath='', label_cols=[], using_ray=False, storage_options={}, **kwargs)

Initialize BaseTileDataset

Can accept either a tile dataframe or a path to tile data

Parameters:

    tile_manifest (pd.DataFrame): Dataframe of tile data. Default: None
    tile_urlpath (str): Base URL/path of tile data. Default: ''
    label_cols (list[str]): (Optional) label columns to return as tensors, e.g. for training. Default: []
    using_ray (bool): (Optional) Perform distributed dataloading with Ray for training. Default: False
    storage_options (dict): (Optional) storage options to pass to reading functions. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def __init__(
    self,
    tile_manifest: pd.DataFrame = None,
    tile_urlpath: str = "",
    label_cols: List[str] = [],
    using_ray: bool = False,
    storage_options: dict = {},
    **kwargs,
):
    """Initialize BaseTileDataset

    Can accept either a tile dataframe or a path to tile data

    Args:
        tile_manifest (pd.DataFrame): Dataframe of tile data
        tile_urlpath (str): Base URL/path of tile data
        label_cols (list[str]): (Optional) label columns to return as tensors, e.g. for training
        using_ray (bool): (Optional) Perform distributed dataloading with Ray for training
    """

    if tile_manifest is not None:
        self.tile_manifest = tile_manifest
    elif tile_urlpath:
        with open(tile_urlpath, **storage_options) as of:
            self.tile_manifest = pd.read_parquet(of).set_index("address")
    else:
        raise RuntimeError("Must specifiy either tile_manifest or tile_path")

    self.label_cols = label_cols
    self.using_ray = using_ray

    self.setup(**kwargs)
preprocess(input_tile)

Preprocessing method called for each tile patch

Must be implemented in the subclass to accept a single PIL image and return a torch tensor.

Parameters:

    input_tile (Image): Input tile as a PIL image. Required.

Returns:

    torch.tensor: Output tile as preprocessed tensor

Source code in src/luna/pathology/analysis/ml.py
def preprocess(self, input_tile: Image):
    """Preprocessing method called for each tile patch

    Must be implemented in the subclass to accept a single PIL image and return a torch tensor.

    Args:
        input_tile (Image): Input tile as a PIL image

    Returns:
        torch.tensor: Output tile as preprocessed tensor
    """
    raise NotImplementedError(
        "preprocess() has not been implimented in the subclass!"
    )
setup(**kwargs)

Set additional attributes for dataset class

Template/abstract method where a dataset is configured

Parameters:

    kwargs: Keyword arguments passed on to the subclass method. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def setup(self, **kwargs):
    """Set additional attributes for dataset class

    Template/abstract method where a dataset is configured

    Args:
        kwargs: Keyword arguments passed on to the subclass method
    """
    raise NotImplementedError("setup() has not been implemented in the subclass!")
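
Example: a minimal sketch of a concrete tile dataset; the parquet path, label column, and transform are illustrative assumptions.

import torch
from PIL import Image
from torchvision import transforms

from luna.pathology.analysis.ml import BaseTorchTileDataset


class TileDataset(BaseTorchTileDataset):
    def setup(self, **kwargs):
        self.transform = transforms.Compose(
            [transforms.Resize(224), transforms.ToTensor()]
        )

    def preprocess(self, input_tile: Image) -> torch.Tensor:
        # PIL image -> torch tensor
        return self.transform(input_tile)


dataset = TileDataset(tile_urlpath="tiles.parquet", label_cols=["regional_label"])
address, tile_tensor, label = dataset[0]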

HDF5Dataset

Bases: Dataset

General dataset that uses an HDF5 manifest convention

Applies preprocessing steps per instance, returning aggregate batches of data. Useful for training and inference.

Source code in src/luna/pathology/analysis/ml.py
class HDF5Dataset(Dataset):
    """General dataset that uses a HDF5 manifest convention

    Applies preprocessing steps per instance, returning aggregate batches of data. Useful for training and inference.
    """

    def __init__(
        self,
        hdf5_manifest,
        preprocess=nn.Identity(),
        label_cols=[],
        using_ray=False,
        storage_options={},
    ):
        """Initialize HD5FDataset

        Args:
            hdf5_manifest (pd.DataFrame): Dataframe of H5 data
            preprocess (transform): Function to apply to every bit of data
            label_cols (list[str]): (Optional) label columns to return as tensors, e.g. for training
            using_ray (bool): (Optional) Perform distributed dataloading with Ray for training
        """
        self.hdf5_manifest = hdf5_manifest
        self.label_cols = label_cols
        self.using_ray = using_ray
        self.preprocess = preprocess
        self.storage_options = storage_options

    def __len__(self):
        return len(self.hdf5_manifest)

    def set_preprocess(self, preprocess):
        self.preprocess = preprocess

    def __repr__(self):
        return f"HDF5Dataset with {len(self.hdf5_manifest)} tiles, indexed by {self.hdf5_manifest.index.names}, returning label columns: {self.label_cols}"

    def __getitem__(self, idx: int):
        """Tile accessor

        Loads a tile image from the tile manifest. Returns the preprocessed tile data, plus a tensor of the label
        data if label columns were specified, else the dataframe index of the tile. If Ray is being used for
        model training, a label column must be provided.

        Args:
            idx (int): Integer index

        Returns:
            (optional str, torch.tensor, optional torch.tensor): tuple of the tile index and corresponding tile as a torch tensor, and metadata labels if specified, else the index
        """

        row = self.hdf5_manifest.iloc[idx]
        img = get_tile_array(row, self.storage_options)

        if self.using_ray and not (len(self.label_cols)):
            raise ValueError(
                "If using Ray for training, you must provide a label column"
            )
        if len(self.label_cols):
            return self.preprocess(img), torch.tensor(row[self.label_cols]).squeeze()
        else:
            return self.preprocess(img), row.name
__getitem__(idx)

Tile accessor

Loads a tile image from the tile manifest. Returns the preprocessed tile data, plus a tensor of the label data if label columns were specified, else the dataframe index of the tile. If Ray is being used for model training, a label column must be provided.

Parameters:

    idx (int): Integer index. Required.

Returns:

    (optional str, torch.tensor, optional torch.tensor): tuple of the tile index and corresponding tile as a torch tensor, and metadata labels if specified, else the index

Source code in src/luna/pathology/analysis/ml.py
def __getitem__(self, idx: int):
    """Tile accessor

    Loads a tile image from the tile manifest. Returns the preprocessed tile data, plus a tensor of the label
    data if label columns were specified, else the dataframe index of the tile. If Ray is being used for
    model training, a label column must be provided.

    Args:
        idx (int): Integer index

    Returns:
        (optional str, torch.tensor, optional torch.tensor): tuple of the tile index and corresponding tile as a torch tensor, and metadata labels if specified, else the index
    """

    row = self.hdf5_manifest.iloc[idx]
    img = get_tile_array(row, self.storage_options)

    if self.using_ray and not (len(self.label_cols)):
        raise ValueError(
            "If using Ray for training, you must provide a label column"
        )
    if len(self.label_cols):
        return self.preprocess(img), torch.tensor(row[self.label_cols]).squeeze()
    else:
        return self.preprocess(img), row.name
__init__(hdf5_manifest, preprocess=nn.Identity(), label_cols=[], using_ray=False, storage_options={})

Initialize HDF5Dataset

Parameters:

    hdf5_manifest (pd.DataFrame): Dataframe of H5 data. Required.
    preprocess (transform): Function to apply to every bit of data. Default: nn.Identity()
    label_cols (list[str]): (Optional) label columns to return as tensors, e.g. for training. Default: []
    using_ray (bool): (Optional) Perform distributed dataloading with Ray for training. Default: False
    storage_options (dict): (Optional) storage options to pass to reading functions. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def __init__(
    self,
    hdf5_manifest,
    preprocess=nn.Identity(),
    label_cols=[],
    using_ray=False,
    storage_options={},
):
    """Initialize HD5FDataset

    Args:
        hdf5_manifest (pd.DataFrame): Dataframe of H5 data
        preprocess (transform): Function to apply to every bit of data
        label_cols (list[str]): (Optional) label columns to return as tensors, e.g. for training
        using_ray (bool): (Optional) Perform distributed dataloading with Ray for training
    """
    self.hdf5_manifest = hdf5_manifest
    self.label_cols = label_cols
    self.using_ray = using_ray
    self.preprocess = preprocess
    self.storage_options = storage_options
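
Example: a hedged usage sketch wrapping an HDF5 tile manifest in a torch DataLoader; the parquet path and label column are hypothetical.

import pandas as pd
from torch.utils.data import DataLoader

from luna.pathology.analysis.ml import HDF5Dataset

hdf5_manifest = pd.read_parquet("tile_manifest.parquet").set_index("address")
dataset = HDF5Dataset(hdf5_manifest, label_cols=["tumor_label"])
loader = DataLoader(dataset, batch_size=32, num_workers=4)

for tiles, labels in loader:
    ...  # training/inference loop over preprocessed tiles and label tensors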

TorchTransformModel

Source code in src/luna/pathology/analysis/ml.py
class TorchTransformModel:
    def get_preprocess(self, **kwargs):
        """The transform model's preprocessing code

        Args:
            kwargs: Keyword arguments passed on to the subclass method
        """
        raise NotImplementedError(
            "get_preprocess() has not been implimented in the subclass!"
        )

    def transform(self, X: torch.Tensor):
        """Main transformer method, X -> X'

        Args:
            X (torch.Tensor): input tensor

        Returns:
            torch.tensor: transformed output tensor X'
        """
        raise NotImplementedError(
            "transform() has not been implimented in the subclass!"
        )

    pass
get_preprocess(**kwargs)

The transform model's preprocessing code

Parameters:

    kwargs: Keyword arguments passed on to the subclass method. Default: {}
Source code in src/luna/pathology/analysis/ml.py
def get_preprocess(self, **kwargs):
    """The transform model's preprocessing code

    Args:
        kwargs: Keyword arguments passed on to the subclass method
    """
    raise NotImplementedError(
        "get_preprocess() has not been implimented in the subclass!"
    )
transform(X)

Main transformer method, X -> X'

Parameters:

    X (torch.Tensor): input tensor. Required.

Returns:

    torch.tensor: transformed output tensor X'

Source code in src/luna/pathology/analysis/ml.py
def transform(self, X: torch.Tensor):
    """Main transformer method, X -> X'

    Args:
        X (torch.Tensor): input tensor

    Returns:
        torch.tensor: transformed output tensor X'
    """
    raise NotImplementedError(
        "transform() has not been implimented in the subclass!"
    )
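
Example: a minimal sketch of a transform model for feature extraction; the truncated ResNet backbone is an illustrative choice, not a luna default.

import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.models import resnet18

from luna.pathology.analysis.ml import TorchTransformModel


class ResnetFeatureModel(TorchTransformModel):
    def __init__(self):
        backbone = resnet18(weights=None)
        # drop the classification head, keep pooled features
        self.model = nn.Sequential(*list(backbone.children())[:-1])
        self.model.eval()

    def get_preprocess(self, **kwargs):
        return transforms.Compose([transforms.ToTensor()])

    def transform(self, X: torch.Tensor):
        with torch.no_grad():
            return self.model(X).flatten(start_dim=1)  # (B, 512) features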

get_group_stratified_sampler(df_nh, label_col, group_col, num_splits=5, random_seed=42)

Generates samplers for a torch DataLoader that are stratified by a given group set (i.e. a column in a dataframe corresponding to patient identifiers) and balanced between target labels

Parameters:

    df_nh (pd.DataFrame): A non-hierarchical/non-multi-indexed/flat dataframe. Required.
    label_col (str): The column name for the classes to balance across training and validation splits. Required.
    group_col (str): The column name used to stratify the data (i.e. patient ids). Required.
    num_splits (int): (Optional) The number of folds, must be at least 2. Default: 5
    random_seed (int): (Optional) Seed used for shuffling within StratifiedGroupKFold. Default: 42

Returns:

    Tuple[SubsetRandomSampler, SubsetRandomSampler]: a tuple of samplers over the training and validation indices

Source code in src/luna/pathology/analysis/ml.py
def get_group_stratified_sampler(
    df_nh: pd.DataFrame,
    label_col: str,
    group_col: str,
    num_splits: int = 5,
    random_seed: int = 42,
) -> Tuple[SubsetRandomSampler, SubsetRandomSampler]:
    """Generates sampler indices for torch DataLoader object that are
    stratified by a given group set (ie a column in a dataframe
    corresponding to patient identifiers), and balanced between target
    labels

    Args:
        df_nh (pd.DataFrame): A non-hierarchical/non-multi-indexed/flat dataframe
        label_col (str): The column name for the classes to balance across training and validation splits.
        group_col (str): The column name used to stratify the data (ie patient ids).
        num_splits (int): (Optional) The number of folds, must at least be 2.
    Returns:
        Tuple[SubsetRandomSampler, SubsetRandomSampler]: a tuple of samplers over the training and validation indices
    """

    cv = StratifiedGroupKFold(
        n_splits=num_splits, random_state=random_seed, shuffle=True
    )
    classes = df_nh[label_col]
    groups = df_nh[group_col]
    for fold_idx, (train_indices, val_indices) in enumerate(
        cv.split(df_nh, classes, groups)
    ):
        # check integrity. asserts that same group (ie patients) aren't in both
        # train and validation splits
        train_groups, val_groups = groups[train_indices], groups[val_indices]
        assert len(set(train_groups) & set(val_groups)) == 0

    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)

    return (train_sampler, val_sampler)
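
Example: a hedged usage sketch building patient-stratified DataLoaders; the parquet path and column names are hypothetical.

import pandas as pd
from torch.utils.data import DataLoader

from luna.pathology.analysis.ml import HDF5Dataset, get_group_stratified_sampler

df = pd.read_parquet("tile_manifest.parquet").reset_index()
train_sampler, val_sampler = get_group_stratified_sampler(
    df, label_col="tumor_label", group_col="patient_id"
)

dataset = HDF5Dataset(df, label_cols=["tumor_label"])
train_loader = DataLoader(dataset, batch_size=32, sampler=train_sampler)
val_loader = DataLoader(dataset, batch_size=32, sampler=val_sampler)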

post_transform_to_2d(input)

Convert input to a 2D numpy array on CPU

Parameters:

    input (torch.tensor): tensor input of shape [B, *] where B is the batch dimension. Required.
Source code in src/luna/pathology/analysis/ml.py
def post_transform_to_2d(input: np.array) -> np.array:
    """Convert input to a 2D numpy array on CPU

    Args:
        input (torch.tensor): tensor input of shape [B, *] where B is the batch dimension
    """
    if isinstance(input, torch.Tensor):
        input = input.cpu().numpy()

    if not len(input.shape) == 2:
        warnings.warn(f"Reshaping model output (was {input.shape}) to 2D")
        input = np.reshape(input, (input.shape[0], -1))

    return input
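
Example: a brief usage sketch flattening a (B, C, 1, 1) model output to (B, C).

import numpy as np

from luna.pathology.analysis.ml import post_transform_to_2d

features = np.random.rand(8, 512, 1, 1)
flat = post_transform_to_2d(features)  # warns and reshapes to (8, 512)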

cli

Created on April 27, 2021

@author: pashaa@mskcc.org

create_wide_shape_features_query

cli(shape_features_urlpath, storage_options={})

Prints wide shape features query for Dremio

Parameters:

    shape_features_urlpath (str): URL/path to shape features parquet file. Required.
    storage_options (dict): storage options to pass to reading functions. Default: {}
Source code in src/luna/pathology/cli/create_wide_shape_features_query.py
def cli(
    shape_features_urlpath: str,
    storage_options: dict = {}
):
    """Prints wide shape features query for Dremio

    Args:
        shape_features_urlpath (str): URL/path to shape features parquet file
        storage_options (dict): storage options to pass to reading functions
    """
    config = get_config(vars())
    query = create_wide_shape_features_query(
        config['shape_features_urlpath'],
        config['storage_options']
    )

    print(query)

create_wide_shape_features_query(shape_features_urlpath, storage_options={})

Gets wide shape features query for Dremio

Parameters:

    shape_features_urlpath (str): URL/path to shape feature parquet file. Required.
    storage_options (dict): storage options to pass to reading functions. Default: {}
Source code in src/luna/pathology/cli/create_wide_shape_features_query.py
def create_wide_shape_features_query(
    shape_features_urlpath: str,
    storage_options: dict = {},
):
    """Gets wide shape features query for dremio

    Args:
        shape_features_urlpath (str): URL/path to shape feature parquet file
        storage_options (dict): storage options to pass to reading functions
    """
    with open(shape_features_urlpath, **storage_options) as of:
        df = pd.read_parquet(of)
    ShapeFeaturesSchema.validate(df)
    df['merged_variable'] = df.Parent + " " + df.Class + " " + df.variable
    return create_query(df['merged_variable'].unique())

dsa_annotation_etl

DsaAnnotationProcessor

Source code in src/luna/pathology/cli/dsa_annotation_etl.py
class DsaAnnotationProcessor:
    def __init__(self, girder, annotation_name, output_urlpath, storage_options):
        self.girder = girder
        self.annotation_name = annotation_name
        self.output_urlpath = output_urlpath
        self.storage_options = storage_options

    def histomics_annotation_table_to_geojson(
        self, df, properties, shape_type_col="type", x_col="x_coords", y_col="y_coords"
    ):
        """Takes a table generated by histomicstk (parse_slide_annotations_into_tables) and creates a geojson"""

        features = []
        df[properties] = df[properties].fillna("None")

        logger.info(f"About to turn {len(df)} geometric annotations into a geojson!")

        for _, row in df.iterrows():
            x, y = deepcopy(row[x_col]), deepcopy(row[y_col])
            if row[shape_type_col] == "polyline":
                x.append(x[0]), y.append(y[0])
                geometry = Polygon(
                    [list(zip(x, y))]
                )  # Polygons are once nested to account for holes

            elif row[shape_type_col] == "point":
                geometry = Point((x[0], y[0]))
            else:
                continue  # don't process non-polyline(regional) or point annotations

            logger.info(f"\tCreated geometry {str(shape(geometry)):.40s}...")
            feature = Feature(
                geometry=geometry, properties={prop: row[prop] for prop in properties}
            )
            features.append(feature)

        feature_collection = FeatureCollection(features)
        logger.info(
            f"Checking geojson, errors with geojson FeatureCollection: {feature_collection.errors()}"
        )

        return feature_collection

    def build_proxy_repr_dsa(self, row):
        """Build a proxy table slice given, primarily, a DSA itemId (slide_item_uuid)"""

        itemId = row.slide_item_uuid
        slide_id = row.slide_id

        logger.info(
            f"Trying to process annotation for slide_id={slide_id}, item_id={itemId}"
        )

        annotation_uuids = get_annotation_uuid(
            self.girder, item_id=itemId, annotation_name=self.annotation_name
        )

        if annotation_uuids is None:
            return None

        # need to loop through annotation uuids since the same annotation name
        # can correspond to multiple uuids (a 'Regional' annotation on the same
        # slide made two days apart)
        df_annotations = []
        for annotation_uuid in annotation_uuids:
            df_annotation = get_annotation_df(self.girder, annotation_uuid)
            df_annotations.append(df_annotation)

        df_annotations = pd.concat(df_annotations)

        # This turns the regional data into a nice geojson
        feature_collection = self.histomics_annotation_table_to_geojson(
            df_annotations,
            ["annotation_girder_id", "element_girder_id", "group", "label"],
            shape_type_col="type",
            x_col="x_coords",
            y_col="y_coords",
        )

        fs, urlpath = fsspec.core.url_to_fs(self.output_urlpath, **self.storage_options)

        slide_geojson_path = str(Path(urlpath) / f"{slide_id}.annotation.geojson")
        with fs.open(slide_geojson_path, "w") as fp:
            json.dump(feature_collection, fp)  # Finally, save it!

        df_annotation_proxy = pd.concat(
            [
                df_annotations,
                pd.DataFrame(
                    [
                        {
                            "slide_item_uuid": itemId,
                            "type": "geojson",
                            "slide_geojson": slide_geojson_path,
                        }
                    ]
                ),
            ]
        )  # Add our geojson as a special type of annotation

        return df_annotation_proxy

    def run(self, row):
        """Run DsaAnnotationProcessor

        Args:
            row (pd.Series): row of a DSA slide table

        Returns:
            pd.DataFrame: annotation metadata
        """

        df = self.build_proxy_repr_dsa(row)

        return df
build_proxy_repr_dsa(row)

Build a proxy table slice given, primarily, a DSA itemId (slide_item_uuid)

Source code in src/luna/pathology/cli/dsa_annotation_etl.py
def build_proxy_repr_dsa(self, row):
    """Build a proxy table slice given, primarily, a DSA itemId (slide_item_uuid)"""

    itemId = row.slide_item_uuid
    slide_id = row.slide_id

    logger.info(
        f"Trying to process annotation for slide_id={slide_id}, item_id={itemId}"
    )

    annotation_uuids = get_annotation_uuid(
        self.girder, item_id=itemId, annotation_name=self.annotation_name
    )

    if annotation_uuids is None:
        return None

    # need to loop through annotation uuids since the same annotation name
    # can correspond to multiple uuids (a 'Regional' annotation on the same
    # slide made two days apart)
    df_annotations = []
    for annotation_uuid in annotation_uuids:
        df_annotation = get_annotation_df(self.girder, annotation_uuid)
        df_annotations.append(df_annotation)

    df_annotations = pd.concat(df_annotations)

    # This turns the regional data into a nice geojson
    feature_collection = self.histomics_annotation_table_to_geojson(
        df_annotations,
        ["annotation_girder_id", "element_girder_id", "group", "label"],
        shape_type_col="type",
        x_col="x_coords",
        y_col="y_coords",
    )

    fs, urlpath = fsspec.core.url_to_fs(self.output_urlpath, **self.storage_options)

    slide_geojson_path = str(Path(urlpath) / f"{slide_id}.annotation.geojson")
    with fs.open(slide_geojson_path, "w") as fp:
        json.dump(feature_collection, fp)  # Finally, save it!

    df_annotation_proxy = pd.concat(
        [
            df_annotations,
            pd.DataFrame(
                [
                    {
                        "slide_item_uuid": itemId,
                        "type": "geojson",
                        "slide_geojson": slide_geojson_path,
                    }
                ]
            ),
        ]
    )  # Add our geojson as a special type of annotation

    return df_annotation_proxy
histomics_annotation_table_to_geojson(df, properties, shape_type_col='type', x_col='x_coords', y_col='y_coords')

Takes a table generated by histomicstk (parse_slide_annotations_into_tables) and creates a geojson

Source code in src/luna/pathology/cli/dsa_annotation_etl.py
def histomics_annotation_table_to_geojson(
    self, df, properties, shape_type_col="type", x_col="x_coords", y_col="y_coords"
):
    """Takes a table generated by histomicstk (parse_slide_annotations_into_tables) and creates a geojson"""

    features = []
    df[properties] = df[properties].fillna("None")

    logger.info(f"About to turn {len(df)} geometric annotations into a geojson!")

    for _, row in df.iterrows():
        x, y = deepcopy(row[x_col]), deepcopy(row[y_col])
        if row[shape_type_col] == "polyline":
            x.append(x[0]), y.append(y[0])
            geometry = Polygon(
                [list(zip(x, y))]
            )  # Polygons are once nested to account for holes

        elif row[shape_type_col] == "point":
            geometry = Point((x[0], y[0]))
        else:
            continue  # don't process non-polyline(regional) or point annotations

        logger.info(f"\tCreated geometry {str(shape(geometry)):.40s}...")
        feature = Feature(
            geometry=geometry, properties={prop: row[prop] for prop in properties}
        )
        features.append(feature)

    feature_collection = FeatureCollection(features)
    logger.info(
        f"Checking geojson, errors with geojson FeatureCollection: {feature_collection.errors()}"
    )

    return feature_collection
run(row)

Run DsaAnnotationProcessor

Parameters:

    row (pd.Series): row of a DSA slide table. Required.

Returns:

    pd.DataFrame: annotation metadata

Source code in src/luna/pathology/cli/dsa_annotation_etl.py
def run(self, row):
    """Run DsaAnnotationProcessor

    Args:
        row (pd.Series): row of a DSA slide table

    Returns:
        pd.DataFrame: annotation metadata
    """

    df = self.build_proxy_repr_dsa(row)

    return df

cli(dsa_endpoint='???', collection_name='???', annotation_name='???', username='${oc.env:DSA_USERNAME}', password='${oc.env:DSA_PASSWORD}', local_config='', output_urlpath='.', storage_options={})

DSA annotation ETL

Parameters:

    dsa_endpoint (str): path to input data. Default: '???'
    collection_name (str): collection name in DSA. Default: '???'
    annotation_name (str): annotation name. Default: '???'
    username (str): DSA username (defaults to environment variable DSA_USERNAME)
    password (str): DSA password (defaults to environment variable DSA_PASSWORD)
    local_config (str): local config yaml url/path. Default: ''
    output_urlpath (str): output/working url/path prefix. Default: '.'
    storage_options (dict): options to pass to reading/writing functions. Default: {}

Returns:

    pd.DataFrame: metadata from function call

Source code in src/luna/pathology/cli/dsa_annotation_etl.py
@timed
@save_metadata
def cli(
    dsa_endpoint: str = "???",
    collection_name: str = "???",
    annotation_name: str = "???",
    username: str = "${oc.env:DSA_USERNAME}",
    password: str = "${oc.env:DSA_PASSWORD}",
    local_config: str = "",
    output_urlpath: str = ".",
    storage_options: dict = {},
):
    """DSA annotation ETL
    Args:
        dsa_endpoint (str): path to input data
        collection_name (str): collection name in DSA
        annotation_name (str): annotation name
        username (str): DSA username (defaults to environment variable DSA_USERNAME)
        password (str): DSA password (defaults to environment variable DSA_PASSWORD)
        local_config (str): local config yaml url/path
        output_urlpath (str): output/working url/path prefix
        storage_options (dict): options to pass to reading/writing functions

    Returns:
        pd.DataFrame: metadata from function call
    """
    config = get_config(vars())

    configure_dask_client()

    df_full_annotation_data = dsa_annotation_etl(
        config["dsa_endpoint"],
        config["collection_name"],
        config["annotation_name"],
        config["username"],
        config["password"],
        config["output_urlpath"],
        config["storage_options"],
    )

    output_fs, output_path = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["storage_options"]
    )

    slide_annotation_dataset_path = str(
        Path(output_path)
        / f"slide_annotation_dataset_{config['collection_name']}_{config['annotation_name']}.parquet"
    )

    if len(df_full_annotation_data) > 0:
        with output_fs.open(slide_annotation_dataset_path, "wb") as of:
            df_full_annotation_data.to_parquet(of)

        properties = {
            "slide_annotation_dataset": slide_annotation_dataset_path,
            "segment_keys": {
                "dsa_collection_uuid": df_full_annotation_data["collection_uuid"][0]
            },
        }
        return properties

dsa_annotation_etl(dsa_endpoint, collection_name, annotation_name, username, password, output_urlpath, storage_options)

DSA annotation ETL

Parameters:

    dsa_endpoint (str): path to input data. Required.
    collection_name (str): collection name in DSA. Required.
    annotation_name (str): annotation name. Required.
    username (str): DSA username. Required.
    password (str): DSA password. Required.
    output_urlpath (str): output/working url/path prefix. Required.
    storage_options (dict): options to pass to reading/writing functions. Required.

Returns:

    pd.DataFrame: slide etl dataframe with annotation columns

Source code in src/luna/pathology/cli/dsa_annotation_etl.py
def dsa_annotation_etl(
    dsa_endpoint: str,
    collection_name: str,
    annotation_name: str,
    username: str,
    password: str,
    output_urlpath: str,
    storage_options: dict,
):
    """DSA annotation ETL

    Args:
        dsa_endpoint (str): path to input data
        collection_name (str): collection name in DSA
        annotation_name (str): annotation name
        username (str): DSA username
        password (str): DSA password
        output_urlpath (str): output/working url/path prefix
        storage_options (dict): options to pass to reading/writing functions

    Returns:
        pd.DataFrame: slide etl dataframe with annotation columns
    """
    client = get_or_create_dask_client()
    # girder = girder_client.GirderClient(apiUrl=dsa_endpoint)
    try:
        girder = girder_client.GirderClient(apiUrl=dsa_endpoint)
        # girder python client doesn't support turning off ssl verify.
        # can be removed once we replace the self-signed cert
        session = requests.Session()
        session.verify = False
        girder._session = session
        girder.authenticate(username, password)

        # check DSA connection
        system_check(girder)

    except Exception as exc:
        logger.error(exc)
        raise RuntimeError("Error connecting to DSA API")

    # dsa_authenticate(girder, username, password)

    collection_uuid = get_collection_uuid(girder, collection_name)

    df_slide_items = get_slide_df(girder, collection_uuid)

    if len(df_slide_items) == 0:
        logger.info("No slides found, exitting!")
        return {}

    # Initialize the DsaAnnotationProcessor
    dap = DsaAnnotationProcessor(
        girder, annotation_name, output_urlpath, storage_options
    )

    logger.info("Dashboard: " + client.dashboard_link)
    df_polygon_data = pd.concat(
        [
            x.result()
            for x in as_completed(
                [client.submit(dap.run, row) for _, row in df_slide_items.iterrows()]
            )
        ]
    )

    # Join the slide level data with the polygon level data, so this is a lot of information!
    df_full_annotation_data = (
        df_slide_items.set_index("slide_item_uuid")
        .join(
            df_polygon_data.set_index("slide_item_uuid"),
            how="right",
            rsuffix="annotation",
        )
        .set_index("slide_id")
    )

    df_full_annotation_data.loc[:, "collection_uuid"] = collection_uuid
    df_full_annotation_data.loc[:, "collection_name"] = collection_name
    df_full_annotation_data.loc[:, "annotation_name"] = annotation_name
    df_full_annotation_data = df_full_annotation_data.drop(columns=["meta"])
    df_full_annotation_data = df_full_annotation_data.rename(
        columns={"group": "group_name"}
    )

    print(df_full_annotation_data)

    # Our dataset is a combination of polyline, point, and geojson annotations!
    logger.info(
        f"""Created {len(df_full_annotation_data.query("type=='geojson'"))} geojsons, {len(df_full_annotation_data.query("type=='point'"))} points, and {len(df_full_annotation_data.query("type=='polyline'"))} polygons"""
    )

    return df_full_annotation_data
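
Example: a hedged usage sketch; the endpoint, collection, annotation name, and credentials are placeholders. The function obtains a Dask client via get_or_create_dask_client().

from luna.pathology.cli.dsa_annotation_etl import dsa_annotation_etl

df_annotations = dsa_annotation_etl(
    dsa_endpoint="http://localhost:8080/api/v1",
    collection_name="my-collection",
    annotation_name="Regional",
    username="user",
    password="pass",
    output_urlpath="./annotations",
    storage_options={},
)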

dsa_upload

__upload_annotation_to_dsa(gc, dsa_endpoint_url, annotation_file_urlpath, collection_name, image_filename, force=False, storage_options={})

Upload annotation to DSA

Upload json annotation file as a new annotation to the image in the DSA collection.

Parameters:

    gc (girder_client.GirderClient): an authenticated girder client. Required.
    dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1. Required.
    annotation_file_urlpath (string): URL/path to a DSA annotation json file. Required.
    collection_name (string): name of the collection in DSA. Required.
    image_filename (string): name of the image file in DSA e.g. 123.svs. Required.
    force (bool): upload even if annotation with same name exists for the slide. Default: False
    storage_options (dict): options to pass to reading functions. Default: {}

Returns:

    dict: item_uuid. None if item doesn't exist

Source code in src/luna/pathology/cli/dsa_upload.py
def __upload_annotation_to_dsa(
    gc: girder_client.GirderClient,
    dsa_endpoint_url: str,
    annotation_file_urlpath: str,
    collection_name: str,
    image_filename: str,
    force: bool = False,
    storage_options: dict = {},
):
    """Upload annotation to DSA

    Upload json annotation file as a new annotation to the image in the DSA collection.

    Args:
        gc (girder_client.GirderClient): an authenticated girder client
        dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1
        annotation_file_urlpath (string): URL/path to a DSA annotation json file
        collection_name (string): name of the collection in DSA
        image_filename (string): name of the image file in DSA e.g. 123.svs
        force (bool): upload even if annotation with same name exists for the slide
        storage_options (dict): options to pass to reading functions

    Returns:
        dict: item_uuid. None if item doesn't exist
    """

    with open(annotation_file_urlpath, **storage_options).open() as annotation_json:
        dsa_annotation = json.load(annotation_json)

    if not force:
        slide_annotation = get_slide_annotation(
            image_filename, dsa_annotation["name"], collection_name, gc
        )
        if slide_annotation:
            logger.info(
                f"Found {slide_annotation[1]['annotation_id']}: slide {image_filename} in collection {collection_name} already has an annotation named {dsa_annotation['name']}"
            )
            return slide_annotation[1]["annotation_id"]

    dsa_uuid = get_item_uuid(gc, image_filename, collection_name)

    if dsa_uuid:
        dsa_uuid = push_annotation_to_dsa_image(
            dsa_uuid,
            annotation_file_urlpath,
            dsa_endpoint_url[:-6],
            gc,
            storage_options,
        )

    return dsa_uuid

cli(dsa_endpoint_url='???', annotation_file_urlpath='', annotation_file_list_urlpath='', collection_name='???', image_filename='', username='${oc.env:DSA_USERNAME}', password='${oc.env:DSA_PASSWORD}', force=False, insecure=False, storage_options={}, local_config='')

Upload annotation to DSA

Upload json annotation file as a new annotation to the image in the DSA collection.

Parameters:

    dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1. Default: '???'
    annotation_file_urlpath (string): URL/path to a DSA annotation json file. Default: ''
    annotation_file_list_urlpath (string): URL/path to a file listing DSA annotation json files, one per line. Default: ''
    collection_name (string): name of the collection in DSA. Default: '???'
    image_filename (string): name of the image file in DSA e.g. 123.svs. If not specified, inferred from annotation_file_urlpath. Default: ''
    username (string): DSA username (defaults to environment variable DSA_USERNAME)
    password (string): DSA password (defaults to environment variable DSA_PASSWORD)
    force (bool): upload even if annotation with same name exists for the slide. Default: False
    insecure (bool): insecure ssl. Default: False
    storage_options (dict): options to pass to reading functions. Default: {}
    local_config (string): local config yaml url/path. Default: ''

Returns:

    dict: metadata

Source code in src/luna/pathology/cli/dsa_upload.py
@timed
@save_metadata
def cli(
    dsa_endpoint_url: str = "???",
    annotation_file_urlpath: str = "",
    annotation_file_list_urlpath: str = "",
    collection_name: str = "???",
    image_filename: str = "",
    username: str = "${oc.env:DSA_USERNAME}",
    password: str = "${oc.env:DSA_PASSWORD}",
    force: bool = False,
    insecure: bool = False,
    storage_options: dict = {},
    local_config: str = "",
):
    """Upload annotation to DSA

    Upload json annotation file as a new annotation to the image in the DSA collection.

    Args:
        dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1
        annotation_file_urlpath (string): URL/path to a DSA annotation json file
        annotation_file_list_urlpath (string): URL/path to a file listing DSA annotation json files, one per line
        collection_name (string): name of the collection in DSA
        image_filename (string): name of the image file in DSA e.g. 123.svs. If not specified, inferred from annotation_file_urlpath
        username (string): DSA username (defaults to environment variable DSA_USERNAME)
        password (string): DSA password (defaults to environment variable DSA_PASSWORD)
        force (bool): upload even if annotation with same name exists for the slide
        insecure (bool): insecure ssl
        storage_options (dict): options to pass to reading functions
        local_config (string): local config yaml url/path

    Returns:
        dict: metadata
    """
    config = get_config(vars())

    if (
        not config["annotation_file_urlpath"]
        and not config["annotation_file_list_urlpath"]
    ):
        raise fire.core.FireError(
            "Specify either annotation_file_urlpath or annotation_file_list_urlpath"
        )

    annotation_file_urlpaths = []
    if config["annotation_file_urlpath"]:
        annotation_file_urlpaths.append(config["annotation_file_urlpath"])
    if config["annotation_file_list_urlpath"]:
        with open(config["annotation_file_list_urlpath"], "r") as of:
            data = of.read()
            annotation_file_urlpaths += data.split("\n")

    uuids = []
    for idx, annotation_file_urlpath in enumerate(annotation_file_urlpaths):
        logger.info(
            f"Uploading {annotation_file_urlpath}: {idx+1}/{len(annotation_file_urlpaths)}"
        )
        image_filename = config["image_filename"]
        if not image_filename:
            image_filename = Path(annotation_file_urlpath).with_suffix(".svs").name
            image_filename = re.sub(".*_", "", image_filename)
            if not image_filename:
                raise ValueError(
                    f"Unable to infer image_filename from {annotation_file_urlpath}"
                )
            logger.info(f"Image filename inferred as {image_filename}")
        dsa_uuid = _upload_annotation_to_dsa(
            config["dsa_endpoint_url"],
            annotation_file_urlpath,
            config["collection_name"],
            image_filename,
            config["username"],
            config["password"],
            config["force"],
            config["insecure"],
            config["storage_options"],
        )
        logger.info(f"Uploaded item to {dsa_uuid}")
        if dsa_uuid:
            uuids.append(dsa_uuid)

    return {"item_uuids": uuids}

upload_annotation_to_dsa(dsa_endpoint_url, slide_manifest, annotation_column, collection_name, image_filename, username, password, force=False, insecure=False, storage_options={})

Upload annotation to DSA

Upload json annotation file as a new annotation to the image in the DSA collection.

Parameters:

    dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1. Required.
    slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl. Required.
    annotation_column (string): annotation column of slide_manifest containing the dsa url. Required.
    collection_name (string): name of the collection in DSA. Required.
    image_filename (string): name of the image file in DSA e.g. 123.svs. If not specified, inferred from annotation_file_urlpath. Required.
    username (string): DSA username (defaults to environment variable DSA_USERNAME). Required.
    password (string): DSA password (defaults to environment variable DSA_PASSWORD). Required.
    force (bool): upload even if annotation with same name exists for the slide. Default: False
    insecure (bool): insecure ssl. Default: False
    storage_options (dict): options to pass to reading functions. Default: {}

Returns:

    DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/dsa_upload.py
def upload_annotation_to_dsa(
    dsa_endpoint_url: str,
    slide_manifest: DataFrame[SlideSchema],
    annotation_column: str,
    collection_name: str,
    image_filename: str,
    username: str,
    password: str,
    force: bool = False,
    insecure: bool = False,
    storage_options: dict = {},
):
    """Upload annotation to DSA

    Upload json annotation file as a new annotation to the image in the DSA collection.

    Args:
        dsa_endpoint_url (string): DSA API endpoint e.g. http://localhost:8080/api/v1
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        annotation_column (string): annotation column of slide_manifest containing the dsa url
        collection_name (string): name of the collection in DSA
        image_filename (string): name of the image file in DSA e.g. 123.svs. If not specified, inferred from annotation_file_urlpath
        username (string): DSA username (defaults to environment variable DSA_USERNAME)
        password (string): DSA password (defaults to environment variable DSA_PASSWORD)
        force (bool): upload even if annotation with same name exists for the slide
        insecure (bool): insecure ssl
        storage_options (dict): options to pass to reading functions

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    uuids = []
    for _, slide in slide_manifest.iterrows():
        uuid = _upload_annotation_to_dsa(
            dsa_endpoint_url,
            slide[annotation_column],
            collection_name,
            image_filename,
            username,
            password,
            force,
            insecure,
            storage_options,
        )
        uuids.append(uuid)
    return slide_manifest.assign(**{annotation_column: uuids})

dsa_viz

__bmp_polygon(input_urlpath, output_urlpath, image_filename, label_map, annotation_name, line_colors=None, fill_colors=None, scale_factor=1, storage_options={}, output_storage_options={})

Build DSA annotation json from a BMP with multiple labels.

Vectorizes and simplifies contours per label.

Parameters:

    input_urlpath (string): url/path to bmp file. Required.
    label_map (dict[int,str]): map of label number to label name. Required.
    annotation_name (string): name of the annotation to be displayed in DSA. Required.
    line_colors (dict[str,str], optional): line color map with {feature name:rgb values}. Default: None
    fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values}. Default: None
    scale_factor (int, optional): scale to match the image on DSA. Default: 1
    storage_options (dict): storage options to pass to read functions. Default: {}
    output_storage_options (dict): storage options to pass to write functions. Default: {}

Returns:

    dict: DSA annotation

Source code in src/luna/pathology/cli/dsa_viz.py
def __bmp_polygon(
    input_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    label_map: Dict[int, str],
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    scale_factor: Optional[int] = 1,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Build DSA annotation json from a BMP with multiple labels.

    Vectorizes and simplifies contours per label.

    Args:
        input_urlpath (string): url/path to bmp file
        label_map (dict[int,str]): map of label number to label name
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict[str,str], optional): line color map with {feature name:rgb values}
        fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values}
        scale_factor (int, optional): scale to match the image on DSA.
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions

    Returns:
        dict: DSA annotation
    """
    elements = []
    Image.MAX_IMAGE_PIXELS = 5000000000
    with open(input_urlpath, **storage_options).open() as of:
        annotation = Image.open(of)
    arr = np.array(annotation)

    for label_num, label_name in label_map.items():
        simplified_contours = vectorize_np_array_bitmask_by_pixel_value(
            arr, label_num, scale_factor=scale_factor
        )

        for n, contour in enumerate(simplified_contours):
            element = copy.deepcopy(base_dsa_polygon_element)
            element["label"]["value"] = label_name
            if fill_colors and label_name in fill_colors:
                element["fillColor"] = fill_colors[label_name]
            if line_colors and label_name in line_colors:
                element["lineColor"] = line_colors[label_name]

            coords = contour.tolist()
            for c in coords:
                c.append(0)
            element["points"] = coords
            elements.append(element)

    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
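For reference, a minimal sketch of the label and color inputs this function expects; the label numbers, names, and color strings below are hypothetical (DSA takes CSS-style rgb/rgba strings):

label_map = {1: "tumor", 2: "stroma"}  # BMP pixel value -> label name
line_colors = {"tumor": "rgb(255, 0, 0)", "stroma": "rgb(0, 255, 0)"}
fill_colors = {"tumor": "rgba(255, 0, 0, 0.3)", "stroma": "rgba(0, 255, 0, 0.3)"}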

__heatmap(input_urlpath, output_urlpath, image_filename, annotation_name, column, tile_size, scale_factor=None, fill_colors=None, line_colors=None, storage_options={}, output_storage_options={})

Generate heatmap based on the tile scores

Creates a heatmap for the given column, using the viridis color palette to set the fill value: the color ranges from purple to yellow for scores from 0 to 1.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | url/path to parquet with tile scores | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| column | list[string] | columns to visualize e.g. tile_score | required |
| tile_size | int | size of tiles | required |
| scale_factor | int | scale to match the image on DSA | None |
| fill_colors | Optional[dict[str, str]] | fill color map with {feature name:rgba values} | None |
| line_colors | Optional[dict[str, str]] | line color map with {feature name:rgb values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | DSA annotation |

Source code in src/luna/pathology/cli/dsa_viz.py
def __heatmap(
    input_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    column: List[str],
    tile_size: int,
    scale_factor: Optional[int] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    line_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Generate heatmap based on the tile scores

    Creates a heatmap for the given column, using the color palette `viridis`
    to set a fill value
    - the color ranges from purple to yellow, for scores from 0 to 1.

    Args:
        input_urlpath (string): url/path to parquet with tile scores
        annotation_name (string): name of the annotation to be displayed in DSA
        column (list[string]): columns to visualize e.g. tile_score
        tile_size (int): size of tiles
        scale_factor (int, optional): scale to match the image on DSA.
        fill_colors (Optional[dict[str,str]]): fill color map with {feature name:rgba values}
        line_colors (Optional[dict[str,str]]): line color map with {feature name:rgb values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions

    Returns:
        dict: DSA annotation
    """
    if isinstance(column, str):
        column = [column]

    with open(input_urlpath, **storage_options) as of:
        df = pd.read_parquet(of).reset_index()
    scaled_tile_size = int(tile_size * int(scale_factor if scale_factor else 1))

    elements = []
    for _, row in df.iterrows():
        element = copy.deepcopy(base_dsa_polygon_element)

        # get label specific color and add to elements
        if len(column) == 1:
            label = row[column[0]]
            element["label"]["value"] = str(label)
        else:
            label = pd.to_numeric(row[column]).idxmax()
            element["label"]["value"] = str(label)

        if fill_colors and label in fill_colors:
            element["fillColor"] = fill_colors[label]
        if line_colors and label in line_colors:
            element["lineColor"] = line_colors[label]

        # convert coordinate string to tuple using eval
        x, y = address_to_coord(row["address"])

        pixel_x = x * scaled_tile_size
        pixel_y = y * scaled_tile_size

        coords = [
            [pixel_x, pixel_y],
            [pixel_x + scaled_tile_size, pixel_y],
            [pixel_x + scaled_tile_size, pixel_y + scaled_tile_size],
            [pixel_x, pixel_y + scaled_tile_size],
            [pixel_x, pixel_y],
        ]
        for c in coords:
            c.append(0)
        element["points"] = coords
        elements.append(element)

    if len(column) == 1:
        annotation_name = column[0] + "_" + annotation_name

    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
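The tile-to-polygon mapping above is plain arithmetic; a small self-contained sketch of it, using a hypothetical tile grid address and tile size:

# hypothetical inputs: tile grid address (x, y), tile size, scale factor
tile_size, scale_factor = 256, 2
scaled_tile_size = tile_size * scale_factor
x, y = 3, 5

pixel_x = x * scaled_tile_size
pixel_y = y * scaled_tile_size
corners = [
    [pixel_x, pixel_y],
    [pixel_x + scaled_tile_size, pixel_y],
    [pixel_x + scaled_tile_size, pixel_y + scaled_tile_size],
    [pixel_x, pixel_y + scaled_tile_size],
    [pixel_x, pixel_y],  # the ring is closed by repeating the first corner
]
# DSA points are 3-D, so a trailing 0 is appended to each vertex
points = [[cx, cy, 0] for cx, cy in corners]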

__qupath_polygon(input_urlpath, output_urlpath, image_filename, annotation_name, classes_to_include, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={})

Build DSA annotation json from Qupath polygon geojson

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | url/path of Qupath polygon geojson | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| classes_to_include | list | list of classification labels to visualize | required |
| line_colors | map | line color map with {feature name:rgb values} | None |
| fill_colors | map | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | dsa annotation |

Source code in src/luna/pathology/cli/dsa_viz.py
def __qupath_polygon(
    input_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    classes_to_include: List,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Build DSA annotation json from Qupath polygon geojson

    Args:
        input_urlpath (string): url/path of Qupath polygon geojson
        annotation_name (string): name of the annotation to be displayed in DSA
        classes_to_include (list): list of classification labels to visualize
        e.g. ["Tumor", "Stroma", ...]
        line_colors (map, optional): line color map with {feature name:rgb values}
        fill_colors (map, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions

    Returns:
        dict: dsa annotation
    """
    regional_file = open(input_urlpath, "r", **storage_options)
    with regional_file.open() as of:
        pixel_clf_polygons = geojson.load(of)

    feature_iter = iter(pixel_clf_polygons)
    if isinstance(pixel_clf_polygons, geojson.feature.FeatureCollection):
        feature_iter = iter(pixel_clf_polygons.features)

    elements = []
    for polygon in feature_iter:
        props = polygon.properties
        if "classification" not in props:
            continue

        label_name = polygon.properties["classification"]["name"]
        if label_name in classes_to_include:
            element = copy.deepcopy(base_dsa_polygon_element)
            element["label"]["value"] = label_name
            if fill_colors and label_name in fill_colors:
                element["fillColor"] = fill_colors[label_name]
            if line_colors and label_name in line_colors:
                element["lineColor"] = line_colors[label_name]

            coords = polygon["geometry"]["coordinates"]

            # uneven nesting of connected components
            for coord in coords:
                if isinstance(coord[0], list) and isinstance(coord[0][0], (int, float)):
                    # deep-copy the template element per ring so appended
                    # entries do not share (and overwrite) the same
                    # "points" list
                    ring_element = copy.deepcopy(element)
                    for c in coord:
                        c.append(0)
                    ring_element["points"] = coord
                    elements.append(ring_element)
                else:
                    for i in range(len(coord)):
                        connected_component_coords = coord[i]
                        connected_component_element = copy.deepcopy(element)
                        for c in connected_component_coords:
                            c.append(0)

                        connected_component_element[
                            "points"
                        ] = connected_component_coords
                        elements.append(connected_component_element)
    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
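For orientation, a minimal sketch of the QuPath geojson feature shape this function consumes; the coordinates and class name below are hypothetical:

feature = {
    "type": "Feature",
    "geometry": {
        "type": "Polygon",
        "coordinates": [[[0, 0], [100, 0], [100, 100], [0, 100], [0, 0]]],
    },
    "properties": {"classification": {"name": "Tumor"}},
}
# only features whose classification name appears in classes_to_include
# (e.g. ["Tumor", "Stroma"]) are converted into DSA elements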

__regional_polygon(input_urlpath, output_urlpath, image_filename, annotation_name, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={})

Build DSA annotation json from regional annotation geojson

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | url/path to regional annotation geojson | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read/write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | DSA annotation |

Source code in src/luna/pathology/cli/dsa_viz.py
def __regional_polygon(
    input_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Build DSA annotation json from regional annotation geojson

    Args:
        input_urlpath (string): url/path to regional annotation geojson
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read/write functions

    Returns:
        dict: DSA annotation
    """
    with open(input_urlpath, **storage_options).open() as regional_file:
        # geojson.load already returns a GeoJSON object; wrapping it in
        # geojson.loads (which expects a string) would fail
        regional_annotation = geojson.load(regional_file)

    elements = []
    for annot in regional_annotation["features"]:
        # get label name and add to element
        element = copy.deepcopy(base_dsa_polygon_element)
        label_name = annot.properties["label_name"]
        element["label"]["value"] = label_name
        if fill_colors and label_name in fill_colors:
            element["fillColor"] = fill_colors[label_name]
        if line_colors and label_name in line_colors:
            element["lineColor"] = line_colors[label_name]

        # add coordinates
        coords = annot["geometry"]["coordinates"]
        # if coordinates have extra nesting, set coordinates to 2d array.
        coords_arr = np.array(coords)
        if coords_arr.ndim == 3 and coords_arr.shape[0] == 1:
            coords = np.squeeze(coords_arr).tolist()

        for c in coords:
            c.append(0)
        element["points"] = coords
        elements.append(element)

    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
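The nesting fix above is easiest to see in isolation; a minimal sketch with hypothetical coordinates:

import numpy as np

coords = [[[0, 0], [10, 0], [10, 10], [0, 0]]]  # extra nesting: shape (1, N, 2)
coords_arr = np.array(coords)
if coords_arr.ndim == 3 and coords_arr.shape[0] == 1:
    coords = np.squeeze(coords_arr).tolist()  # -> shape (N, 2)
print(coords)  # [[0, 0], [10, 0], [10, 10], [0, 0]]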

__stardist_cell(input_urlpath, output_urlpath, image_filename, annotation_name, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={})

Build DSA annotation json from TSV classification data generated by stardist

Processes cell classification data generated by Qupath/stardist and adds the center coordinates of the cells as annotation elements.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | url/path to TSV classification data generated by stardist | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read/write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | dsa annotation |

Source code in src/luna/pathology/cli/dsa_viz.py
def __stardist_cell(
    input_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
):
    """Build DSA annotation json from TSV classification data generated by
    stardist

    Processes cell classification data generated by Qupath/stardist and adds
    the center coordinates of the cells as annotation elements.

    Args:
        input_urlpath (string): url/path to TSV classification data generated by stardist
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read/write functions

    Returns:
        dict: dsa annotation
    """
    # the qupath/stardist cell TSV can be quite large to load all columns
    # into memory (it contains many feature columns),
    # so only load the basic columns that are needed for now
    cols_to_load = [
        "Image",
        "Name",
        "Class",
        "Centroid X µm",
        "Centroid Y µm",
    ]
    df = pd.read_csv(
        input_urlpath,
        sep="\t",
        usecols=cols_to_load,
        index_col=False,
        storage_options=storage_options,
    )

    # do some preprocessing on the tsv -- e.g. stardist sometimes finds
    # cells in glass
    # df = df[df["Parent"] != "Glass"]
    df = df.dropna(subset=["Centroid X µm", "Centroid Y µm"])
    # populate json elements
    elements = []
    for idx, row in df.iterrows():
        elements_entry = copy.deepcopy(base_dsa_point_element)

        # x,y coordinates from stardist are in microns so divide by
        # QUPATH_MAG_FACTOR = 0.5011 (exact 20x mag factor used by qupath
        # specifically)
        x = row["Centroid X µm"] / QUPATH_MAG_FACTOR
        y = row["Centroid Y µm"] / QUPATH_MAG_FACTOR

        # Get cell label and add to element
        label_name = row["Class"]
        elements_entry["label"]["value"] = label_name
        if fill_colors and label_name in fill_colors:
            elements_entry["fillColor"] = fill_colors[label_name]
        if line_colors and label_name in line_colors:
            elements_entry["lineColor"] = line_colors[label_name]

        # add centroid coordinate of cell to element
        center = [x, y, 0]
        elements_entry["center"] = center

        elements.append(elements_entry)

    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
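A minimal sketch of the micron-to-pixel conversion applied to each centroid; the centroid values below are hypothetical, and the magnification factor is the module constant noted in the comments above:

QUPATH_MAG_FACTOR = 0.5011  # 20x mag factor used by qupath, from the module
x_um, y_um = 1253.4, 881.9  # hypothetical centroid in microns

center = [x_um / QUPATH_MAG_FACTOR, y_um / QUPATH_MAG_FACTOR, 0]
# "center" is what the DSA point element stores for the cell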

__stardist_polygon(input_urlpath, output_urlpath, image_filename, annotation_name, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={})

Build DSA annotation from stardist geojson classification results

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | URL/path to stardist geojson classification results | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict[str, str] | user-provided line color map with {feature name:rgb values} | None |
| fill_colors | dict[str, str] | user-provided fill color map with {feature name:rgba values} | None |

Returns:

| Type | Description |
| --- | --- |
| string | annotation file path |

Source code in src/luna/pathology/cli/dsa_viz.py
def __stardist_polygon(
    input_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Build DSA annotation from stardist geojson classification results

    Args:
        input_urlpath (string): URL/path to stardist geojson classification results
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict[str,str]): user-provided line color map with {feature name:rgb values}
        fill_colors (dict[str,str]): user-provided fill color map with {feature name:rgba values}

    Returns:
        string: annotation file path
    """
    # TODO: find better fix
    # can't handle NaNs for vectors, so replace all NaNs for now; see
    # https://stackoverflow.com/questions/17140886/how-to-search-and-replace-text-in-a-file
    with open(input_urlpath, "r", **storage_options).open() as input_file:
        filedata = input_file.read()
    newdata = filedata.replace("NaN", "-1")

    elements = []
    for cell in ijson.items(newdata, "item"):
        label_name = cell["properties"]["classification"]["name"]
        coord_list = list(cell["geometry"]["coordinates"][0])

        # uneven nested list when iterative parsing of json --> make sure
        # to get the list of coords
        # this can come as mixed types as well, so type checking needed
        while (
            isinstance(coord_list, list)
            and isinstance(coord_list[0], list)
            and not isinstance(coord_list[0][0], (int, float, Decimal))
        ):
            coord_list = coord_list[0]

        coords = [[float(coord[0]), float(coord[1]), 0] for coord in coord_list]
        element = copy.deepcopy(base_dsa_polygon_element)

        element["label"]["value"] = str(label_name)
        if fill_colors and label_name in fill_colors:
            element["fillColor"] = fill_colors[label_name]
        if line_colors and label_name in line_colors:
            element["lineColor"] = line_colors[label_name]
        element["points"] = coords

        elements.append(element)

    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
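The un-nesting loop above is easiest to see on a toy input; a minimal sketch with a hypothetical over-nested polygon ring:

coord_list = [[[[0, 0], [5, 0], [5, 5], [0, 0]]]]  # over-nested ring
while (
    isinstance(coord_list, list)
    and isinstance(coord_list[0], list)
    and not isinstance(coord_list[0][0], (int, float))
):
    coord_list = coord_list[0]
print(coord_list)  # [[0, 0], [5, 0], [5, 5], [0, 0]]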

__stardist_polygon_tile(object_urlpath, tiles_urlpath, output_urlpath, image_filename, annotation_name_prefix, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={})

Build DSA annotation json from stardist geojson classification and labeled tiles

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| object_urlpath | string | URL/path to stardist geojson classification results | required |
| tiles_urlpath | string | URL/path to tiles manifest parquet | required |
| output_urlpath | string | URL/path prefix to save annotations | required |
| image_filename | string | name of the image file in DSA e.g. 123.svs | required |
| annotation_name_prefix | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict | user-provided line color map with {feature name:rgb values} | None |
| fill_colors | dict | user-provided fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | DSA annotations |

Source code in src/luna/pathology/cli/dsa_viz.py
def __stardist_polygon_tile(
    object_urlpath: str,
    tiles_urlpath: str,
    output_urlpath: str,
    image_filename: str,
    annotation_name_prefix: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Build DSA annotation json from stardist geojson classification and labeled tiles

    Args:
        object_urlpath (string): URL/path to stardist geojson classification results
        tiles_urlpath (string): URL/path to tiles manifest parquet
        output_urlpath (string): URL/path prefix to save annotations
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name_prefix (string): name of the annotation to be displayed in DSA
        line_colors (dict): user-provided line color map with {feature name:rgb values}
        fill_colors (dict): user-provided fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions

    Returns:
        dict: DSA annotations
    """
    with open(tiles_urlpath, **storage_options) as of:
        tiles_df = pd.read_parquet(of)
    LabeledTileSchema.validate(tiles_df.reset_index())
    logger.info(f"Read tiles manifest with {len(tiles_df)} tiles")

    with open(object_urlpath, **storage_options) as of:
        object_gdf = gpd.read_file(of)

    logger.info(f"Read {len(object_gdf)} stardist objects")

    ann_region_polygons = [
        box(
            row.x_coord,
            row.y_coord,
            row.x_coord + row.xy_extent,
            row.y_coord + row.xy_extent,
        )
        for _, row in tiles_df.iterrows()
    ]
    tiles_gdf = gpd.GeoDataFrame(
        data=tiles_df, geometry=ann_region_polygons, crs="EPSG:4326"
    )

    object_tiles = object_gdf.sjoin(tiles_gdf, how="left", predicate="within")
    logger.info("Spatially joined stardist objects with tiles manifest")
    tile_elements = {}
    for _, row in object_tiles.iterrows():
        tile_label = row["Classification"]
        if pd.isnull(tile_label):
            tile_label = "unclassified"

        if tile_label not in tile_elements.keys():
            tile_elements[tile_label] = []

        label_name = row["classification"]["name"]
        multipolygon = row["geometry"]
        if not isinstance(multipolygon, MultiPolygon):
            multipolygon = MultiPolygon([multipolygon])
        for polygon in list(multipolygon.geoms):
            coord_list = list(polygon.exterior.coords)

            coords = [[float(coord[0]), float(coord[1]), 0] for coord in coord_list]
            element = copy.deepcopy(base_dsa_polygon_element)

            element["label"]["value"] = str(label_name)
            if fill_colors and label_name in fill_colors:
                element["fillColor"] = fill_colors[label_name]
            if line_colors and label_name in line_colors:
                element["lineColor"] = line_colors[label_name]
            element["points"] = coords

            tile_elements[tile_label].append(element)

    metadata = {}
    for tile_label, elements in tile_elements.items():
        dsa_annotation = get_dsa_annotation(
            elements, annotation_name_prefix + "_" + tile_label
        )
        annotation_filepath = save_dsa_annotation(
            dsa_annotation,
            output_urlpath,
            image_filename,
            output_storage_options,
        )
        metadata[tile_label] = annotation_filepath

    return metadata

bitmask_polygon(input_map, output_urlpath, image_filename, annotation_name, line_colors=None, fill_colors=None, scale_factor=1, storage_options={}, output_storage_options={})

Build DSA annotation json from bitmask PNGs

Vectorizes and simplifies contours from the bitmask.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_map | map | map of {label:urlpath_to_bitmask_png} | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| scale_factor | int | scale to match the image on DSA | 1 |
| storage_options | dict | storage options to pass to read/write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | DSA annotation |

Source code in src/luna/pathology/cli/dsa_viz.py
def bitmask_polygon(
    input_map: Dict[str, str],
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    scale_factor: Optional[int] = 1,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Build DSA annotation json from bitmask PNGs

    Vectorizes and simplifies contours from the bitmask.

    Args:
        input_map (map): map of {label:urlpath_to_bitmask_png}
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        scale_factor (int, optional): scale to match the image on DSA.
        storage_options (dict): storage options to pass to read/write functions

    Returns:
        dict: DSA annotation
    """
    if not check_filepaths_valid(input_map.values(), storage_options):
        raise ValueError("No valid PNG masks found. Exiting..")

    elements = []
    for bitmask_label, bitmask_filepath in input_map.items():
        Image.MAX_IMAGE_PIXELS = 5000000000
        with open(bitmask_filepath, "rb", **storage_options).open() as of:
            annotation = Image.open(of)
            bitmask_np = np.array(annotation)
        simplified_contours = vectorize_np_array_bitmask_by_pixel_value(
            bitmask_np, scale_factor=scale_factor
        )

        for n, contour in enumerate(simplified_contours):
            element = copy.deepcopy(base_dsa_polygon_element)
            label_name = bitmask_label
            element["label"]["value"] = label_name
            if fill_colors and label_name in fill_colors:
                element["fillColor"] = fill_colors[label_name]
            if line_colors and label_name in line_colors:
                element["lineColor"] = line_colors[label_name]

            coords = contour.tolist()
            for c in coords:
                c.append(0)
            element["points"] = coords
            elements.append(element)

    dsa_annotation = get_dsa_annotation(elements, annotation_name)
    return save_dsa_annotation(
        dsa_annotation,
        output_urlpath,
        image_filename,
        output_storage_options,
    )
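A minimal usage sketch, assuming a local mask PNG and output directory; all paths, labels, and colors below are hypothetical:

from luna.pathology.cli.dsa_viz import bitmask_polygon

annotation_path = bitmask_polygon(
    input_map={"tumor": "/data/masks/123_tumor.png"},
    output_urlpath="/data/dsa_annotations",
    image_filename="123.svs",
    annotation_name="tumor_regions",
    fill_colors={"tumor": "rgba(255, 0, 0, 0.3)"},
)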

bitmask_polygon_cli(input_map='???', output_urlpath='???', image_filename='???', annotation_name='???', line_colors=None, fill_colors=None, scale_factor=None, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from bitmask PNGs

Vectorizes and simplifies contours from the bitmask.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_map | map | map of {label:path_to_bitmask_png} | '???' |
| output_urlpath | string | url/path to save the DSA compatible annotation | '???' |
| image_filename | string | name of the image file in DSA e.g. 123.svs | '???' |
| annotation_name | string | name of the annotation to be displayed in DSA | '???' |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| scale_factor | int | scale to match the image on DSA | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| local_config | string | local config yaml file | '' |

Returns:

| Type | Description |
| --- | --- |
| dict | annotation file path |

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def bitmask_polygon_cli(
    input_map: Dict[str, str] = "???",  # type: ignore
    output_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name: str = "???",
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    scale_factor: Optional[int] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from bitmask PNGs

    Vectorizes and simplifies contours from the bitmask.

    Args:
        input_map (map): map of {label:path_to_bitmask_png}
        output_urlpath (string): url/path to save the DSA compatible annotation
        json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        scale_factor (int, optional): scale to match the image on DSA.
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config yaml file

    Returns:
        dict: annotation file path
    """
    config = get_config(vars())
    annotation_filepath = bitmask_polygon(
        config["input_map"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name"],
        config["line_colors"],
        config["fill_colors"],
        config["scale_factor"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return {"dsa_annotation": annotation_filepath}

bmp_polygon(slide_manifest, output_urlpath, label_map, annotation_name, line_colors=None, fill_colors=None, scale_factor=1, storage_options={}, output_storage_options={}, annotation_column='bmp_polygon_url', output_column='bmp_polygon_dsa_url')

Build DSA annotation json from a BMP with multiple labels.

Vectorizes and simplifies contours per label.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| slide_manifest | DataFrame[SlideSchema] | slide manifest from slide_etl | required |
| output_urlpath | string | url/path prefix to save the DSA compatible annotation | required |
| label_map | dict[int, str] | map of label number to label name | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict[str, str] | line color map with {feature name:rgb values} | None |
| fill_colors | dict[str, str] | fill color map with {feature name:rgba values} | None |
| scale_factor | int | scale to match the image on DSA | 1 |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| annotation_column | string | column containing url to BMP polygon | 'bmp_polygon_url' |
| output_column | string | column with result url to add to slide_manifest | 'bmp_polygon_dsa_url' |

Returns:

| Type | Description |
| --- | --- |
| DataFrame[SlideSchema] | slide manifest with the output column added |

Source code in src/luna/pathology/cli/dsa_viz.py
def bmp_polygon(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    label_map: Dict[int, str],
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    scale_factor: Optional[int] = 1,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    annotation_column: str = "bmp_polygon_url",
    output_column: str = "bmp_polygon_dsa_url",
):
    """Build DSA annotation json from a BMP with multiple labels.

    Vectorizes and simplifies contours per label.

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (string): url/path prefix to save the DSA compatible annotation
        json
        label_map (dict[int,str]): map of label number to label name
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict[str,str], optional): line color map with {feature name:rgb values}
        fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values}
        scale_factor (int, optional): scale to match the image on DSA.
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        annotation_column (string): column containing url to BMP polygon
        output_column (string): column with result url to add to slide_manifest

    Returns:
        DataFrame[SlideSchema]: slide manifest with the output column added
    """
    if annotation_column not in slide_manifest.columns:
        raise ValueError(f"{annotation_column} not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __bmp_polygon,
            row[annotation_column],
            output_urlpath,
            image_filename,
            label_map,
            annotation_name,
            line_colors,
            fill_colors,
            scale_factor,
            storage_options,
            output_storage_options,
        )
        futures.append(future)
    progress(futures)
    dsa_annotation_urls = client.gather(futures)
    return slide_manifest.assign(**{output_column: dsa_annotation_urls})
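A minimal usage sketch, assuming a slide manifest that carries at least the url and bmp_polygon_url columns; all paths and labels below are hypothetical:

import pandas as pd
from luna.pathology.cli.dsa_viz import bmp_polygon

slide_manifest = pd.DataFrame(
    {
        "url": ["/data/slides/123.svs"],
        "bmp_polygon_url": ["/data/masks/123.bmp"],
    }
)
updated_manifest = bmp_polygon(
    slide_manifest,
    output_urlpath="/data/dsa_annotations",
    label_map={1: "tumor"},
    annotation_name="tumor_regions",
)
# the returned manifest gains a bmp_polygon_dsa_url column with the saved
# annotation paths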

bmp_polygon_cli(input_urlpath='???', output_urlpath='???', label_map='???', image_filename='???', annotation_name='???', line_colors=None, fill_colors=None, scale_factor=1, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from a BMP with multiple labels.

Vectorizes and simplifies contours per label.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | url/path to bmp file | '???' |
| output_urlpath | string | url/path prefix to save the DSA compatible annotation | '???' |
| label_map | dict[int, str] | map of label number to label name | '???' |
| image_filename | string | name of the image file in DSA e.g. 123.svs | '???' |
| annotation_name | string | name of the annotation to be displayed in DSA | '???' |
| line_colors | dict[str, str] | line color map with {feature name:rgb values} | None |
| fill_colors | dict[str, str] | fill color map with {feature name:rgba values} | None |
| scale_factor | int | scale to match the image on DSA | 1 |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| dict | annotation file path |

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def bmp_polygon_cli(
    input_urlpath: str = "???",
    output_urlpath: str = "???",
    label_map: Dict[int, str] = "???",  # type: ignore
    image_filename: str = "???",
    annotation_name: str = "???",
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    scale_factor: Optional[int] = 1,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from a BMP with multiple labels.

    Vectorizes and simplifies contours per label.

    Args:
        input_urlpath (string): url/path to bmp file
        output_urlpath (string): url/path prefix to save the DSA compatible annotation
        json
        label_map (dict[int,str]): map of label number to label name
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict[str,str], optional): line color map with {feature name:rgb values}
        fill_colors (dict[str,str], optional): fill color map with {feature name:rgba values}
        scale_factor (int, optional): scale to match the image on DSA.
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions

    Returns:
        dict: annotation file path
    """
    config = get_config(vars())
    annotation_filepath = __bmp_polygon(
        config["input_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["label_map"],
        config["annotation_name"],
        config["line_colors"],
        config["fill_colors"],
        config["scale_factor"],
        config["storage_options"],
        config["output_storage_options"],
    )

    return {"dsa_annotation": annotation_filepath}

check_filepaths_valid(urls, storage_options)

Checks if all paths exist.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| urls | list | file URLs/paths | required |
| storage_options | dict | storage options to pass to fsspec | required |

Returns:

| Type | Description |
| --- | --- |
| bool | True if all file paths exist, False otherwise |

Source code in src/luna/pathology/cli/dsa_viz.py
def check_filepaths_valid(urls, storage_options):
    """Checks if all paths exist.

    Args:
        urls (list): file URLs/paths
        storage_options (dict): storage options to pass to fsspec

    Returns:
        bool: True if all file paths exist, False otherwise
    """

    all_files_found = True
    for url in urls:
        fs, urlpath = fsspec.core.url_to_fs(url, **storage_options)
        if not fs.exists(urlpath):
            logger.warning(f"url in config: {url} does not exist")
            all_files_found = False
    return all_files_found
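A minimal usage sketch with hypothetical local paths:

from luna.pathology.cli.dsa_viz import check_filepaths_valid

ok = check_filepaths_valid(
    ["/data/masks/123_tumor.png", "/data/masks/123_stroma.png"],
    storage_options={},
)
if not ok:
    raise ValueError("No valid PNG masks found")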

get_dsa_annotation(elements, annotation_name, description='')

Helper function to get dsa annotation

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| elements | list | list of annotation elements | required |
| annotation_name | string | annotation name for HistomicsUI | required |
| description | string | annotation description | '' |

Returns:

| Type | Description |
| --- | --- |
| dict | DSA annotation |

Source code in src/luna/pathology/cli/dsa_viz.py
def get_dsa_annotation(elements: list, annotation_name: str, description: str = ""):
    """Helper function to get dsa annotation

    Args:
        elements (list): list of annotation elements
        annotation_name (string): annotation name for HistomicsUI
        image_filename (string): name of the image in DSA e.g. 123.svs

    Returns:
        string: annotation file path. None if error in writing the file.
    """
    dsa_annotation = {
        "description": description,
        "elements": elements,
        "name": annotation_name,
    }

    dsa_annotation["elements"] = elements
    dsa_annotation["name"] = annotation_name

    return dsa_annotation
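A minimal sketch of calling this helper; the element below is a hand-rolled polyline and only illustrative (in practice elements are built from base_dsa_polygon_element as in the functions above):

from luna.pathology.cli.dsa_viz import get_dsa_annotation

elements = [
    {
        "type": "polyline",
        "closed": True,
        "points": [[0, 0, 0], [10, 0, 0], [10, 10, 0], [0, 0, 0]],
        "label": {"value": "tumor"},
    }
]
annotation = get_dsa_annotation(elements, "tumor regions")
# -> {"description": "", "elements": [...], "name": "tumor regions"}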

heatmap(slide_manifest, output_urlpath, annotation_name, column, tile_size, scale_factor=None, fill_colors=None, line_colors=None, output_column='', storage_options={}, output_storage_options={})

Generate heatmap based on the tile scores

Creates a heatmap for the given column, using the viridis color palette to set the fill value: the color ranges from purple to yellow for scores from 0 to 1.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| slide_manifest | DataFrame[SlideSchema] | slide manifest from slide_etl | required |
| output_urlpath | string | URL/path prefix to save the DSA compatible annotation | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| column | list[string] | column(s) to visualize e.g. tile_score | required |
| tile_size | int | size of tiles | required |
| scale_factor | int | scale to match the image on DSA | None |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| output_column | string | column with result url to add to slide_manifest | '' |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame[SlideSchema] | slide manifest with the output column added |

Source code in src/luna/pathology/cli/dsa_viz.py
def heatmap(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    annotation_name: str,
    column: List[str],
    tile_size: int,
    scale_factor: Optional[int] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    line_colors: Optional[Dict[str, str]] = None,
    output_column: str = "",
    storage_options: Dict = {},
    output_storage_options: Dict = {},
):
    """Generate heatmap based on the tile scores

    Creates a heatmap for the given column, using the color palette `viridis`
    to set a fill value
    - the color ranges from purple to yellow, for scores from 0 to 1.

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (string): URL/path prefix to save the DSA compatible annotation
        json
        annotation_name (string): name of the annotation to be displayed in DSA
        column (list[string]): column(s) to visualize e.g. tile_score
        tile_size (int): size of tiles
        scale_factor (int, optional): scale to match the image on DSA.
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions

    Returns:
        DataFrame[SlideSchema]: slide manifest with the output column added
    """
    if not output_column:
        output_column = f"{annotation_name}_dsa_url"
    if "tiles_url" not in slide_manifest.columns:
        raise ValueError("tiles_url not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __heatmap,
            row["tiles_url"],
            output_urlpath,
            image_filename,
            annotation_name,
            column,
            tile_size,
            scale_factor,
            fill_colors,
            line_colors,
            storage_options,
            output_storage_options,
        )

        futures.append(future)
    progress(futures)
    dsa_annotation_urls = client.gather(futures)
    return slide_manifest.assign(**{output_column: dsa_annotation_urls})
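A minimal usage sketch, assuming a slide manifest with url and tiles_url columns; all paths and column names below are hypothetical:

import pandas as pd
from luna.pathology.cli.dsa_viz import heatmap

slide_manifest = pd.DataFrame(
    {
        "url": ["/data/slides/123.svs"],
        "tiles_url": ["/data/tiles/123.parquet"],
    }
)
slide_manifest = heatmap(
    slide_manifest,
    output_urlpath="/data/dsa_annotations",
    annotation_name="tile_scores",
    column=["tile_score"],
    tile_size=256,
)
# the returned manifest gains a tile_scores_dsa_url column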

heatmap_cli(input_urlpath='???', output_urlpath='???', image_filename='???', annotation_name='???', column='???', tile_size='???', scale_factor=1, fill_colors=None, line_colors=None, storage_options={}, output_storage_options={}, local_config='')

Generate heatmap based on the tile scores

Creates a heatmap for the given column, using the viridis color palette to set the fill value: the color ranges from purple to yellow for scores from 0 to 1.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | URL/path to parquet with tile scores | '???' |
| output_urlpath | string | URL/path prefix to save the DSA compatible annotation | '???' |
| image_filename | string | name of the image file in DSA e.g. 123.svs | '???' |
| annotation_name | string | name of the annotation to be displayed in DSA | '???' |
| column | string | column to visualize e.g. tile_score | '???' |
| tile_size | int | size of tiles | '???' |
| scale_factor | int | scale to match the image on DSA | 1 |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| local_config | string | local config yaml file | '' |

Returns:

| Type | Description |
| --- | --- |
| dict | annotation file path |

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def heatmap_cli(
    input_urlpath: str = "???",
    output_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name: str = "???",
    column: str = "???",
    tile_size: int = "???",  # type: ignore
    scale_factor: Optional[int] = 1,
    fill_colors: Optional[dict[str, str]] = None,
    line_colors: Optional[dict[str, str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Generate heatmap based on the tile scores

    Creates a heatmap for the given column, using the color palette `viridis`
    to set a fill value
    - the color ranges from purple to yellow, for scores from 0 to 1.

    Args:
        input_urlpath (string): URL/path to parquet with tile scores
        output_urlpath (string): URL/path prefix to save the DSA compatible annotation
        json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        column (string): column to visualize e.g. tile_score
        tile_size (int): size of tiles
        scale_factor (int, optional): scale to match the image on DSA.
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config yaml file

    Returns:
        dict: annotation file path. None if error in writing the file.
    """
    config = get_config(vars())
    annotation_filepath = __heatmap(
        config["input_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name"],
        config["column"],
        config["tile_size"],
        config["scale_factor"],
        config["fill_colors"],
        config["line_colors"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return {"dsa_annotation": annotation_filepath}

qupath_polygon(slide_manifest, output_urlpath, image_filename, annotation_name, classes_to_include, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, annotation_column='', output_column='')

Build DSA annotation json from Qupath polygon geojson

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| slide_manifest | DataFrame[SlideSchema] | slide manifest from slide_etl | required |
| output_urlpath | string | URL/path prefix for saving the DSA compatible annotation | required |
| image_filename | string | name of the image file in DSA e.g. 123.svs | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| classes_to_include | list | list of classification labels to visualize | required |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| annotation_column | string | column containing url to qupath geojson | '' |
| output_column | string | column with result url to add to slide_manifest | '' |

Returns:

| Type | Description |
| --- | --- |
| DataFrame[SlideSchema] | slide manifest |

Source code in src/luna/pathology/cli/dsa_viz.py
def qupath_polygon(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    image_filename: str,
    annotation_name: str,
    classes_to_include: List,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    annotation_column: str = "",
    output_column: str = "",
):
    """Build DSA annotation json from Qupath polygon geojson

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (string): URL/path prefix for saving the DSA compatible annotation
        json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        classes_to_include (list): list of classification labels to visualize
        e.g. ["Tumor", "Stroma", ...]
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        annotation_column (string): column containing url to qupath geojson
        output_column (string): column with result url to add to slide_manifest

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    if not annotation_column:
        annotation_column = f"{annotation_name}_geojson_url"
    if not output_column:
        output_column = f"{annotation_name}_dsa_url"
    if annotation_column not in slide_manifest.columns:
        raise ValueError(f"{annotation_column} not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __qupath_polygon,
            row[annotation_column],
            output_urlpath,
            image_filename,
            annotation_name,
            classes_to_include,
            line_colors,
            fill_colors,
            storage_options,
            output_storage_options,
        )

        futures.append(future)
    progress(futures)
    dsa_annotation_urls = client.gather(futures)
    return slide_manifest.assign(**{output_column: dsa_annotation_urls})

qupath_polygon_cli(input_urlpath='???', output_urlpath='???', image_filename='???', annotation_name='???', classes_to_include='???', line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from Qupath polygon geojson

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | URL/path of Qupath polygon geojson | '???' |
| output_urlpath | string | URL/path prefix for saving the DSA compatible annotation | '???' |
| image_filename | string | name of the image file in DSA e.g. 123.svs | '???' |
| annotation_name | string | name of the annotation to be displayed in DSA | '???' |
| classes_to_include | list | list of classification labels to visualize | '???' |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| local_config | string | local config yaml file | '' |

Returns:

| Type | Description |
| --- | --- |
| dict | annotation file path |

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def qupath_polygon_cli(
    input_urlpath: str = "???",
    output_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name: str = "???",
    classes_to_include: list = "???",  # type: ignore
    line_colors: Optional[dict[str, str]] = None,
    fill_colors: Optional[dict[str, str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from Qupath polygon geojson

    Args:
        input_urlpath (string): URL/path of Qupath polygon geojson
        output_urlpath (string): URL/path prefix for saving the DSA compatible annotation
        json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        classes_to_include (list): list of classification labels to visualize
        e.g. ["Tumor", "Stroma", ...]
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config yaml file

    Returns:
        dict: annotation file path
    """
    config = get_config(vars())
    annotation_filepath = __qupath_polygon(
        config["input_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name"],
        config["classes_to_include"],
        config["line_colors"],
        config["fill_colors"],
        config["storage_options"],
        config["output_storage_options"],
    )

    return {"dsa_annotation": annotation_filepath}

regional_polygon(slide_manifest, output_urlpath, annotation_name, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, annotation_column='', output_column='')

Build DSA annotation json from regional annotation geojson

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| slide_manifest | DataFrame[SlideSchema] | slide manifest | required |
| output_urlpath | string | URL/path prefix for saving dsa annotation json | required |
| annotation_name | string | name of the annotation to be displayed in DSA | required |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| annotation_column | string | column containing url to regional geojson | '' |
| output_column | string | column with result url to add to slide_manifest | '' |

Returns:

| Type | Description |
| --- | --- |
| DataFrame[SlideSchema] | slide manifest |

Source code in src/luna/pathology/cli/dsa_viz.py
def regional_polygon(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    annotation_column: str = "",
    output_column: str = "",
):
    """Build DSA annotation json from regional annotation geojson

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest
        output_urlpath (string): URL/path prefix for saving dsa annotation json
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        annotation_column (string): column containing url to regional geojson
        output_column (string): column with result url to add to slide_manifest

    Returns:
        DataFrame[SlideSchema]: slide schema
    """

    if not annotation_column:
        annotation_column = f"{annotation_name}_geojson_url"
    if not output_column:
        output_column = f"{annotation_name}_dsa_url"
    if annotation_column not in slide_manifest.columns:
        raise ValueError(f"{annotation_column} not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __regional_polygon,
            row[annotation_column],
            output_urlpath,
            image_filename,
            annotation_name,
            line_colors,
            fill_colors,
            storage_options,
            output_storage_options,
        )

        futures.append(future)
    progress(futures)
    dsa_annotation_urls = client.gather(futures)
    return slide_manifest.assign(**{output_column: dsa_annotation_urls})

regional_polygon_cli(input_urlpath='???', output_urlpath='???', image_filename='???', annotation_name='???', line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from regional annotation geojson

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_urlpath | string | URL/path to regional annotation geojson | '???' |
| output_urlpath | string | URL/path prefix for saving dsa annotation json | '???' |
| image_filename | string | name of the image file in DSA e.g. 123.svs | '???' |
| annotation_name | string | name of the annotation to be displayed in DSA | '???' |
| line_colors | dict | line color map with {feature name:rgb values} | None |
| fill_colors | dict | fill color map with {feature name:rgba values} | None |
| storage_options | dict | storage options to pass to read functions | {} |
| output_storage_options | dict | storage options to pass to write functions | {} |
| local_config | string | local config yaml file | '' |

Returns:

| Type | Description |
| --- | --- |
| dict | annotation file path |

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def regional_polygon_cli(
    input_urlpath: str = "???",
    output_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name: str = "???",
    line_colors: Optional[dict[str, str]] = None,
    fill_colors: Optional[dict[str, str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from regional annotation geojson

    Args:
        input_urlpath (string): URL/path to regional annotation geojson
        output_urlpath (string): URL/path prefix for saving dsa annotation json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config yaml file

    Returns:
        dict: annotation file path
    """

    config = get_config(vars())

    annotation_filepath = __regional_polygon(
        config["input_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name"],
        config["line_colors"],
        config["fill_colors"],
        config["storage_options"],
        config["output_storage_options"],
    )

    return {"dsa_annotation": annotation_filepath}

save_dsa_annotation(dsa_annotation, output_urlpath, image_filename, storage_options={})

Helper function to save annotation elements to a json file.

Parameters:

Name Type Description Default
dsa_annotation dict

DSA annotations

required
output_urlpath string

url/path to a directory to save the annotation file

required
image_filename string

name of the image in DSA e.g. 123.svs

required
storage_options dict

options for storage functions

{}

Returns:

Name Type Description
string

annotation file path, or None if the file could not be written.

Source code in src/luna/pathology/cli/dsa_viz.py
def save_dsa_annotation(
    dsa_annotation: dict,
    output_urlpath: str,
    image_filename: str,
    storage_options: dict = {},
):
    """Helper function to save annotation elements to a json file.

    Args:
        dsa_annotation (dict): DSA annotations
        output_urlpath (string): url/path to a directory to save the annotation file
        image_filename (string): name of the image in DSA e.g. 123.svs
        storage_options (dict): options for storage functions

    Returns:
        string: annotation file path, or None if the file could not be written.
    """

    result = re.search(image_id_regex, image_filename)
    if result:
        image_id = result.group(1)
    else:
        raise InvalidImageIdException(f"Invalid image filename: {image_filename}")

    annotation_name_replaced = dsa_annotation["name"].replace(" ", "_")

    fs, output_urlpath_prefix = fsspec.core.url_to_fs(output_urlpath, **storage_options)
    output_path = (
        Path(output_urlpath_prefix) / f"{annotation_name_replaced}_{image_id}.json"
    )

    if not fs.exists(output_urlpath_prefix):
        fs.mkdir(output_urlpath_prefix)

    with fs.open(output_path, "w") as outfile:
        json.dump(dsa_annotation, outfile)
    logger.info(
        f"Saved {len(dsa_annotation['elements'])} to {fs.unstrip_protocol(str(output_path))}"
    )
    return fs.unstrip_protocol(str(output_path))

stardist_cell(slide_manifest, output_urlpath, annotation_name, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, annotation_column='', output_column='')

Build DSA annotation json from TSV classification data generated by stardist

Processes cell classification data generated by QuPath/stardist and adds the center coordinates of the cells as annotation elements.

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest

required
output_urlpath string

URL/path prefix for saving dsa annotation json

required
annotation_name string

name of the annotation to be displayed in DSA

required
line_colors dict

line color map with {feature name:rgb values}

None
fill_colors dict

fill color map with {feature name:rgba values}

None
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
annotation_column string

column containing url to stardist polygon geojson

''
output_column string

column with result url to add to slide_manifest

''

Returns:

Type Description

DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/dsa_viz.py
def stardist_cell(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    annotation_column: str = "",
    output_column: str = "",
):
    """Build DSA annotation json from TSV classification data generated by
    stardist

    Processes cell classification data generated by QuPath/stardist and
    adds the center coordinates of the cells
    as annotation elements.

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest
        output_urlpath (string): URL/path prefix for saving dsa annotation json
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        annotation_column (string): column containing url to stardist polygon geojson
        output_column (string): column with result url to add to slide_manifest

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    if not annotation_column:
        annotation_column = f"{annotation_name}_tsv_url"
    if not output_column:
        output_column = f"{annotation_name}_dsa_url"
    if annotation_column not in slide_manifest.columns:
        raise ValueError(f"{annotation_column} not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __stardist_cell,
            row[annotation_column],
            output_urlpath,
            image_filename,
            annotation_name,
            line_colors,
            fill_colors,
            storage_options,
            output_storage_options,
        )

        futures.append(future)
    progress(futures)
    dsa_annotation_urls = client.gather(futures)
    return slide_manifest.assign(**{output_column: dsa_annotation_urls})

stardist_cell_cli(input_urlpath='???', output_urlpath='???', image_filename='???', annotation_name='???', line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from TSV classification data generated by stardist

Processes cell classification data generated by QuPath/stardist and adds the center coordinates of the cells as annotation elements.

Parameters:

Name Type Description Default
input_urlpath string

URL/path to TSV classification data generated by stardist

'???'
output_urlpath string

URL/path prefix for saving dsa annotation json

'???'
image_filename string

name of the image file in DSA e.g. 123.svs

'???'
annotation_name string

name of the annotation to be displayed in DSA

'???'
line_colors dict

line color map with {feature name:rgb values}

None
fill_colors dict

fill color map with {feature name:rgba values}

None
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
local_config string

local config YAML file

''

Returns:

Type Description

dict[str,str]: annotation file path

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def stardist_cell_cli(
    input_urlpath: str = "???",
    output_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name: str = "???",
    line_colors: Optional[dict[str, str]] = None,
    fill_colors: Optional[dict[str, str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from TSV classification data generated by
    stardist

    Processes cell classification data generated by QuPath/stardist and
    adds the center coordinates of the cells
    as annotation elements.

    Args:
        input_urlpath (string): URL/path to TSV classification data generated by stardist
        output_urlpath (string): URL/path prefix for saving dsa annotation json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config YAML file

    Returns:
        dict[str,str]: annotation file path
    """
    config = get_config(vars())
    annotation_filepath = __stardist_cell(
        config["input_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name"],
        config["line_colors"],
        config["fill_colors"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return {"dsa_annotation": annotation_filepath}

stardist_polygon(slide_manifest, output_urlpath, annotation_name, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, annotation_column='', output_column='')

Build DSA annotation json from stardist geojson classification results

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest from slide_etl

required
output_urlpath string

URL/path prefix to save annotations

required
annotation_name string

name of the annotation to be displayed in DSA

required
line_colors dict

user-provided line color map with {feature name:rgb values}

None
fill_colors dict

user-provided fill color map with {feature name:rgba values}

None
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
annotation_column string

column containing url to stardist polygon geojson

''
output_column string

column with result url to add to slide_manifest

''

Returns:

Type Description

DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/dsa_viz.py
def stardist_polygon(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    annotation_name: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    annotation_column: str = "",
    output_column: str = "",
):
    """Build DSA annotation json from stardist geojson classification results

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (string): URL/path prefix to save annotations
        annotation_name (string): name of the annotation to be displayed in DSA
        line_colors (dict): user-provided line color map with {feature name:rgb values}
        fill_colors (dict): user-provided fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        annotation_column (string): column containing url to stardist polygon geojson
        output_column (string): column with result url to add to slide_manifest

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    if not annotation_column:
        annotation_column = f"{annotation_name}_geojson_url"
    if not output_column:
        output_column = f"{annotation_name}_dsa_url"

    if annotation_column not in slide_manifest.columns:
        raise ValueError(f"{annotation_column} not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __stardist_polygon,
            row[annotation_column],
            output_urlpath,
            image_filename,
            annotation_name,
            line_colors,
            fill_colors,
            storage_options,
            output_storage_options,
        )

        futures.append(future)
    progress(futures)
    dsa_annotation_urls = client.gather(futures)
    for idx, dsa_annotation_url in enumerate(dsa_annotation_urls):
        slide_manifest.at[idx, output_column] = dsa_annotation_url

    return slide_manifest

stardist_polygon_cli(input_urlpath='???', image_filename='???', annotation_name='???', output_urlpath='???', line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from stardist geojson classification results

Parameters:

Name Type Description Default
input_urlpath string

URL/path to stardist geojson classification results json

'???'
image_filename string

name of the image file in DSA e.g. 123.svs

'???'
annotation_name string

name of the annotation to be displayed in DSA

'???'
output_urlpath string

URL/path prefix to save annotations

'???'
line_colors dict

user-provided line color map with {feature name:rgb values}

None
fill_colors dict

user-provided fill color map with {feature name:rgba values}

None
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
local_config string

local config YAML file

''

Returns:

Type Description

dict[str,str]: annotation file path

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def stardist_polygon_cli(
    input_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name: str = "???",
    output_urlpath: str = "???",
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from stardist geojson classification results

    Args:
        input_urlpath (string): URL/path to stardist geojson classification results json
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name (string): name of the annotation to be displayed in DSA
        output_urlpath (string): URL/path prefix to save annotations
        line_colors (dict): user-provided line color map with {feature name:rgb values}
        fill_colors (dict): user-provided fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config YAML file

    Returns:
        dict[str,str]: annotation file path
    """
    config = get_config(vars())
    annotation_filepath = __stardist_polygon(
        config["input_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name"],
        config["line_colors"],
        config["fill_colors"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return {"dsa_annotation": annotation_filepath}

stardist_polygon_tile(slide_manifest, output_urlpath, annotation_name_prefix, line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, annotation_column='', output_column_suffix='')

Build DSA annotation json from stardist geojson classification and labeled tiles

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest

required
annotation_name_prefix string

name of the annotation to be displayed in DSA

required
output_urlpath string

URL/path prefix to save annotations

required
line_colors dict

user-provided line color map with {feature name:rgb values}

None
fill_colors dict

user-provided fill color map with {feature name:rgba values}

None
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
annotation_column string

column containing url to stardist polygon geojson

''
output_column_suffix string

column suffix with result url to add to slide_manifest

''

Returns:

Type Description

DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/dsa_viz.py
def stardist_polygon_tile(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    annotation_name_prefix: str,
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: Dict = {},
    output_storage_options: Dict = {},
    annotation_column: str = "",
    output_column_suffix: str = "",
):
    """Build DSA annotation json from stardist geojson classification and labeled tiles

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest
        annotation_name_prefix (string): name of the annotation to be displayed in DSA
        output_urlpath (string): URL/path prefix to save annotations
        line_colors (dict): user-provided line color map with {feature name:rgb values}
        fill_colors (dict): user-provided fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        annotation_column (string): column containing url to stardist polygon geojson
        output_column_suffix (string): column suffix with result url to add to slide_manifest

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    if not annotation_column:
        annotation_column = f"{annotation_name_prefix}_geojson_url"
    if not output_column_suffix:
        output_column_suffix = f"{annotation_name_prefix}_dsa_url"
    if annotation_column not in slide_manifest.columns:
        raise ValueError(f"{annotation_column} not found in slide manifest")
    client = get_or_create_dask_client()
    futures = []
    for _, row in slide_manifest.iterrows():
        image_filename = os.path.basename(row["url"])
        future = client.submit(
            __stardist_polygon_tile,
            row[annotation_column],
            row["tiles_url"],
            output_urlpath,
            image_filename,
            annotation_name_prefix,
            line_colors,
            fill_colors,
            storage_options,
            output_storage_options,
        )

        futures.append(future)
    progress(futures)
    dsa_annotation_url_maps = client.gather(futures)
    tile_labels = dsa_annotation_url_maps[0].keys()
    return slide_manifest.assign(
        **{
            f"{tile_label}_{output_column_suffix}": [
                x[tile_label] for x in dsa_annotation_url_maps
            ]
            for tile_label in tile_labels
        }
    )

stardist_polygon_tile_cli(object_urlpath='???', tiles_urlpath='???', image_filename='???', annotation_name_prefix='???', output_urlpath='???', line_colors=None, fill_colors=None, storage_options={}, output_storage_options={}, local_config='')

Build DSA annotation json from stardist geojson classification and labeled tiles

Parameters:

Name Type Description Default
object_urlpath string

URL/path to object geojson classification results

'???'
tiles_urlpath string

URL/path to tiles manifest parquet

'???'
image_filename string

name of the image file in DSA e.g. 123.svs

'???'
annotation_name_prefix string

name of the annotation to be displayed in DSA

'???'
output_urlpath string

URL/path prefix to save annotations

'???'
line_colors dict

user-provided line color map with {feature name:rgb values}

None
fill_colors dict

user-provided fill color map with {feature name:rgba values}

None
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
local_config string

local config YAML file

''

Returns:

Type Description

dict[str,str]: annotation file path

Source code in src/luna/pathology/cli/dsa_viz.py
@timed
@save_metadata
def stardist_polygon_tile_cli(
    object_urlpath: str = "???",
    tiles_urlpath: str = "???",
    image_filename: str = "???",
    annotation_name_prefix: str = "???",
    output_urlpath: str = "???",
    line_colors: Optional[Dict[str, str]] = None,
    fill_colors: Optional[Dict[str, str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Build DSA annotation json from stardist geojson classification and labeled tiles

    Args:
        object_urlpath (string): URL/path to object geojson classification results
        tiles_urlpath (string): URL/path to tiles manifest parquet
        image_filename (string): name of the image file in DSA e.g. 123.svs
        annotation_name_prefix (string): name of the annotation to be displayed in DSA
        output_urlpath (string): URL/path prefix to save annotations
        line_colors (dict): user-provided line color map with {feature name:rgb values}
        fill_colors (dict): user-provided fill color map with {feature name:rgba values}
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (string): local config YAML file

    Returns:
        dict[str,str]: annotation file path
    """
    config = get_config(vars())
    metadata = __stardist_polygon_tile(
        config["object_urlpath"],
        config["tiles_urlpath"],
        config["output_urlpath"],
        config["image_filename"],
        config["annotation_name_prefix"],
        config["line_colors"],
        config["fill_colors"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return metadata

extract_kfunction_statistics

cli(input_cell_objects_urlpath='???', tile_size='???', intensity_label='???', tile_stride='???', radius='???', output_urlpath='.', storage_options={}, output_storage_options={}, local_config='')

Run the k-function using a sliding-window approach, where the k-function is computed locally in smaller windows and aggregated across the entire slide.

Parameters:

Name Type Description Default
input_cell_objects_urlpath str

url/path to cell objects (.parquet)

'???'
tile_size int

size of tiles to use (at the requested magnification)

'???'
tile_stride int

spacing between tiles

'???'
intensity_label str

column of the cell objects to use for intensity calculations (for the I-K function: spatial + scalar-value clustering)

'???'
radius float

the radius to consider

'???'
output_urlpath str

output URL/path prefix

'.'
storage_options dict

storage options for reading the cell objects

{}

Returns:

Type Description

dict: metadata about function call

Source code in src/luna/pathology/cli/extract_kfunction_statistics.py
@timed
@save_metadata
def cli(
    input_cell_objects_urlpath: str = "???",
    tile_size: int = "???",  # type: ignore
    intensity_label: str = "???",
    tile_stride: int = "???",  # type: ignore
    radius: float = "???",  # type: ignore
    output_urlpath: str = ".",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Run k function using a sliding window approach, where the k-function is computed locally in a smaller window, and aggregated across the entire slide.

    Args:
        input_cell_objects_urlpath (str): url/path to cell objects (.parquet)
        tile_size (int): size of tiles to use (at the requested magnification)
        tile_stride (int): spacing between tiles
        intensity_label (str): column of the cell objects to use for intensity calculations (for the I-K function: spatial + scalar-value clustering)
        radius (float): the radius to consider
        output_urlpath (str): output URL/path prefix
        storage_options (dict): storage options for reading the cell objects

    Returns:
        dict: metadata about function call
    """
    config = get_config(vars())

    configure_dask_client()

    df_stats = extract_kfunction(
        config["input_cell_objects_urlpath"],
        config["tile_size"],
        config["intensity_label"],
        config["tile_stride"],
        config["radius"],
        config["storage_options"],
    )
    fs, output_urlpath_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )
    output_tile_header = Path(output_urlpath_prefix) / (
        str(Path(config["input_cell_objects_urlpath"]).stem)
        + "_kfunction_supertiles.parquet"
    )
    with fs.open(output_tile_header, "wb") as of:
        df_stats.to_parquet(of)

    properties = {
        "slide_tiles": str(output_tile_header),
    }

    return properties

extract_kfunction(input_cell_objects_urlpath, tile_size, intensity_label, tile_stride, radius, storage_options={})

Run the k-function using a sliding-window approach, where the k-function is computed locally in smaller windows and aggregated across the entire slide.

Parameters:

Name Type Description Default
input_cell_objects_urlpath str

URL/path to cell objects (.parquet)

required
tile_size int

size of tiles to use (at the requested magnification)

required
intensity_label str

column of the cell objects to use for intensity calculations (for the I-K function: spatial + scalar-value clustering)

required
tile_stride int

spacing between tiles

required
radius float

the radius to consider

required
storage_options dict

storage options for reading the cell objects

{}

Returns:

Type Description

pd.DataFrame: k-function feature data

Source code in src/luna/pathology/cli/extract_kfunction_statistics.py
def extract_kfunction(
    input_cell_objects_urlpath: str,
    tile_size: int,
    intensity_label: str,
    tile_stride: int,
    radius: float,
    storage_options: dict = {},
):
    """Run k function using a sliding window approach, where the k-function is computed locally in a smaller window, and aggregated across the entire slide.

    Args:
        input_cell_objects_urlpath (str): URL/path to cell objects (.parquet)
        tile_size (int): size of tiles to use (at the requested magnification)
        intensity_label (str): column of the cell objects to use for intensity calculations (for the I-K function: spatial + scalar-value clustering)
        tile_stride (int): spacing between tiles
        radius (float): the radius to consider
        storage_options (dict): storage options for reading the cell objects

    Returns:
        pd.DataFrame: k-function feature data
    """
    client = get_or_create_dask_client()
    df = pd.read_parquet(input_cell_objects_urlpath, storage_options=storage_options)

    l_address = []
    l_k_function_futures = []
    l_x_coord = []
    l_y_coord = []

    feature_name = (
        f"ikfunction_r{radius}_stain{intensity_label.replace(' ','_').replace(':','')}"
    )

    coords = product(
        range(int(df["x_coord"].min()), int(df["x_coord"].max()), tile_stride),
        range(int(df["y_coord"].min()), int(df["y_coord"].max()), tile_stride),
    )

    logger.info("Submitting tasks...")
    for x, y in coords:
        df_tile = df.query(
            f"x_coord >= {x} and x_coord <= {x+tile_size} and y_coord >={y} and y_coord <= {y+tile_size}"
        )

        if len(df_tile) < 3:
            continue

        future = client.submit(
            Kfunction,
            df_tile[["x_coord", "y_coord"]],
            df_tile[["x_coord", "y_coord"]],
            intensity=np.array(df_tile[intensity_label]),
            radius=radius,
            count=True,
        )

        l_address.append(coord_to_address((x, y), 0))
        l_k_function_futures.append(future)
        l_x_coord.append(x)
        l_y_coord.append(y)
    logger.info("Waiting for all tasks to complete...")
    progress(l_k_function_futures)
    l_k_function = client.gather(l_k_function_futures)

    df_stats = pd.DataFrame(
        {
            "address": l_address,
            "x_coord": l_x_coord,
            "y_coord": l_y_coord,
            "results": l_k_function,
        }
    ).set_index("address")
    df_stats.loc[:, "xy_extent"] = tile_size
    df_stats.loc[:, "tile_size"] = tile_size  # Same, 1 to 1
    df_stats.loc[:, "tile_units"] = "um"  # Same, 1 to 1

    df_stats[feature_name] = df_stats["results"].apply(lambda x: x["intensity"])
    df_stats[feature_name + "_norm"] = (
        df_stats[feature_name] / df_stats[feature_name].max()
    )

    df_stats = df_stats.drop(columns=["results"]).dropna()

    logger.info("Generated k-function feature data:")
    logger.info(df_stats)

    return df_stats

extract_shape_features

cli(slide_mask_urlpath='???', label_cols='???', output_urlpath='???', include_smaller_regions=False, storage_options={}, output_storage_options={}, local_config='')

Extracts shape and spatial features (HIF features) from a slide mask. This CLI extracts two sets of features. The first set is 'whole slide features', where the entire mask label is considered as a single region and features are extracted. These features are useful for determining things like the total area of tissue x.

The second set is 'regional features', where each label is split up according to its connectivity and features are extracted from these smaller regions. These features are useful for determining things like the solidity of the ten largest regions of tissue y. Pixel intensity values from the WSI are unused. To generate connected regions, skimage produces its own label mask in which different values correspond to different regions, which discards the tissue type information from the original mask. The original mask is therefore passed as an intensity image to ensure that each region can be associated with a tissue type.

Parameters:

Name Type Description Default
slide_mask_urlpath str

URL/path to slide mask (*.tif)

'???'
label_cols List[str]

list of labels that correspond to those in slide_mask_urlpath

'???'
output_urlpath str

output URL/path prefix

'???'
include_smaller_regions bool

include the smaller regions (not just the largest)

False
storage_options dict

storage options to pass to read functions

{}
output_storage_options dict

storage options to pass to write functions

{}
local_config str

local config YAML file

''

Returns:

Name Type Description
dict

output CSV path and the number of shapes for which features were generated

Source code in src/luna/pathology/cli/extract_shape_features.py
@timed
@save_metadata
def cli(
    slide_mask_urlpath: str = "???",
    label_cols: List[str] = "???",  # type: ignore
    output_urlpath: str = "???",  # type: ignore
    include_smaller_regions: bool = False,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Extracts shape and spatial features (HIF features) from a slide mask.
    This CLI extracts two sets of features. The first set are 'whole slide features', where
    the entire mask label is considred as a single region and features are extracted. These features
    are useful for determining things like total area of x tissue.

    The second set of features are 'regional features', where each label is split up according to
    their connectivity and features are extracted from these smaller regions.
    These features are useful for determining things like solidity of the top ten largest
    regions of tissue y. Pixel intensity values from the WSI are unused. In order to generate
    connected regions, skimage generates a mask itself where different values coorespond
    to different regions, which removes the tissue type information from the original mask.
    So, the original mask is passed as an intensity image to ensure that each region can be
    associated with a tissue type.

    Args:
        slide_mask_urlpath (str): URL/path to slide mask (*.tif)
        label_cols (List[str]): list of labels that correspond to those in slide_mask_urlpath
        output_urlpath (str): output URL/path prefix
        include_smaller_regions (bool): include the smaller regions (not just the largest)
        storage_options (dict): storage options to pass to read functions
        output_storage_options (dict): storage options to pass to write functions
        local_config (str): local config YAML file

    Returns:
        dict: output CSV path and the number of shapes for which features were generated

    """
    config = get_config(vars())

    with open(config["slide_mask_urlpath"], "rb", **config["storage_options"]) as of:
        mask = tifffile.imread(of)

    mask_values = {k: v + 1 for v, k in enumerate(config["label_cols"])}
    result_df = extract_shape_features(
        mask, mask_values, config["include_smaller_regions"]
    )

    fs, urlpath = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )

    output_fpath = Path(urlpath) / "shape_features.csv"
    with fs.open(output_fpath, "w") as of:
        result_df.to_csv(of)

    properties = {"shape_features": output_fpath, "num_shapes": len(result_df)}

    logger.info(properties)
    return properties

extract_shape_features(mask, mask_values, include_smaller_regions=False, properties=['area', 'bbox', 'bbox_area', 'centroid', 'convex_area', 'convex_image', 'coords', 'eccentricity', 'equivalent_diameter', 'euler_number', 'extent', 'filled_area', 'filled_image', 'image', 'inertia_tensor', 'inertia_tensor_eigvals', 'label', 'local_centroid', 'major_axis_length', 'minor_axis_length', 'moments', 'moments_central', 'moments_hu', 'moments_normalized', 'orientation', 'perimeter', 'slice', 'solidity'])

Extracts shape and spatial features (HIF features) from a slide mask

Args:

mask (np.ndarray): slide mask
mask_values (Dict[str, int]): map from label name to integer value in the mask
include_smaller_regions (bool): include the smaller regions (not just the largest)
properties (List[str]): list of shape properties to extract (defaults to the regionprops-style list shown in the signature)

Returns:

Type Description

pd.DataFrame: shape and spatial features

Source code in src/luna/pathology/cli/extract_shape_features.py
def extract_shape_features(
    mask: np.ndarray,
    mask_values: Dict[str, int],
    include_smaller_regions=False,
    properties: List[str] = [
        "area",
        "bbox",
        "bbox_area",
        "centroid",
        "convex_area",
        "convex_image",
        "coords",
        "eccentricity",
        "equivalent_diameter",
        "euler_number",
        "extent",
        "filled_area",
        "filled_image",
        "image",
        "inertia_tensor",
        "inertia_tensor_eigvals",
        "label",
        "local_centroid",
        "major_axis_length",
        "minor_axis_length",
        "moments",
        "moments_central",
        "moments_hu",
        "moments_normalized",
        "orientation",
        "perimeter",
        "slice",
        "solidity",
    ],
):
    """Extracts shape and spatial features (HIF features) from a slide mask

     Args:
        slide_mask_urlpath (str): url/path to slide mask (*.tif)
        label_cols (List[str]): list of labels that coorespond to those in slide_mask_urlpath

    Returns:
        pd.DataFrame: shape and spatial features
    """

    logger.info(f"Mask shape={mask.shape}")

    logger.info("Extracting regional features based on connectivity")
    whole_slide_features_df = extract_whole_slide_features(
        mask, mask_values, properties
    )
    whole_slide_features_df["Parent"] = "whole_region"
    whole_slide_features_df = whole_slide_features_df.set_index("Class")
    whole_slide_features_df["area_fraction"] = (
        whole_slide_features_df["area"] / whole_slide_features_df["area"].sum()
    )
    whole_slide_features_mdf = pd.melt(
        whole_slide_features_df.reset_index(), id_vars=["Parent", "Class"]
    )

    area_col = whole_slide_features_df.columns.get_loc("area")
    idx0, idx1 = np.triu_indices(len(whole_slide_features_df), 1)
    np.seterr(divide="ignore")
    whole_slide_ratio_df = pd.DataFrame(
        data={
            "Parent": "whole_region",
            "variable": np.array(
                [
                    f"area_log_ratio_to_{row}"
                    for row in whole_slide_features_df.index.values
                ]
            )[idx1],
            "value": np.log(whole_slide_features_df.iloc[idx0, area_col].values)
            - np.log(whole_slide_features_df.iloc[idx1, area_col].values),
        },
        index=whole_slide_features_df.index[idx0],
    )
    whole_slide_ratio_df = whole_slide_ratio_df.reset_index()

    regional_features_df = extract_regional_features(
        mask, mask_values, properties + ["min_intensity", "max_intensity"]
    )
    regional_features_df = regional_features_df.assign(
        Parent=[f"region_{x}" for x in range(len(regional_features_df))]
    )
    regional_features_df = regional_features_df.set_index(["Parent", "Class"])
    regional_features_df["area_fraction"] = (
        regional_features_df["area"] / whole_slide_features_df["area"]
    )
    regional_features_mdf = pd.melt(
        regional_features_df.reset_index(), id_vars=["Parent", "Class"]
    )

    regional_features_df = regional_features_df.reset_index()
    largest_regional_features_df = regional_features_df.loc[
        regional_features_df.groupby("Class")["area"].idxmax()
    ]
    largest_regional_features_df["Parent"] = "largest_region"
    largest_regional_features_df = largest_regional_features_df.set_index("Class")
    largest_regional_features_mdf = pd.melt(
        largest_regional_features_df.reset_index(), id_vars=["Parent", "Class"]
    )

    area_col = largest_regional_features_df.columns.get_loc("area")
    idx0, idx1 = np.triu_indices(len(largest_regional_features_df), 1)
    np.seterr(divide="ignore")
    ratio_df = pd.DataFrame(
        data={
            "Parent": "largest_region",
            "variable": np.array(
                [
                    f"area_log_ratio_to_{row}"
                    for row in largest_regional_features_df.index.values
                ]
            )[idx1],
            "value": np.log(largest_regional_features_df.iloc[idx0, area_col].values)
            - np.log(largest_regional_features_df.iloc[idx1, area_col].values),
        },
        index=largest_regional_features_df.index[idx0],
    )
    ratio_df = ratio_df.reset_index()

    result_df = pd.concat(
        [
            whole_slide_features_mdf,
            whole_slide_ratio_df,
            largest_regional_features_mdf,
            ratio_df,
        ]
    )

    if include_smaller_regions:
        result_df = pd.concat([result_df, regional_features_mdf])

    return result_df

extract_stain_texture

cli(slide_image_urlpath='???', slide_mask_urlpath='???', stain_sample_factor='???', stain_channel='???', tile_size='???', output_urlpath='.', storage_options={}, output_storage_options={}, local_config='')

Compute GLCM texture features on a de-convolved slide image

Parameters:

Name Type Description Default
slide_image_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

'???'
slide_mask_urlpath str

url/path to slide mask (.tif)

'???'
stain_sample_factor float

downsample factor to use for stain vector estimation

'???'
stain_channel int

which channel of the deconvolved image to use for texture analysis

'???'
tile_size int

size of tiles to use (at the requested magnification) (500-1000 recommended)

'???'
output_urlpath str

output/working directory

'.'

Returns:

Name Type Description
dict

metadata about function call

Source code in src/luna/pathology/cli/extract_stain_texture.py
@timed
@save_metadata
def cli(
    slide_image_urlpath: str = "???",
    slide_mask_urlpath: str = "???",
    stain_sample_factor: float = "???",  # type: ignore
    stain_channel: int = "???",  # type: ignore
    tile_size: int = "???",  # type: ignore
    output_urlpath: str = ".",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Compute GLCM texture features on a de-convolved slide image

    Args:
        slide_image_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        slide_mask_urlpath (str): url/path to slide mask (.tif)
        stain_sample_factor (float): downsample factor to use for stain vector estimation
        stain_channel (int): which channel of the deconvolved image to use for texture analysis
        tile_size (int): size of tiles to use (at the requested magnification) (500-1000 recommended)
        output_urlpath (str): output/working directory

    Returns:
        dict: metadata about function call

    """
    config = get_config(vars())
    df_result = extract_stain_texture(
        config["slide_image_urlpath"],
        config["slide_mask_urlpath"],
        config["stain_sample_factor"],
        config["stain_channel"],
        config["tile_size"],
        config["output_urlpath"],
        config["storage_options"],
        config["output_storage_options"],
    )

    fs, urlpath_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )
    output_filename = Path(urlpath_prefix) / "stainomics.parquet"
    with fs.open(output_filename, "wb") as of:
        df_result.to_parquet(of, index=False)

    properties = {
        # "num_pixel_observations": n,
        "feature_data": output_filename,
    }

    return properties

extract_stain_texture(slide_image_urlpath, slide_mask_urlpath, stain_sample_factor, stain_channel, tile_size, output_urlpath, storage_options, output_storage_options)

Compute GLCM texture after automatically deconvolving the image into stain channels, using tile-based processing

Runs summary statistics on the distribution of texture values.

Saves the raw per-feature value vectors (.npy) to the output directory.

Parameters:

Name Type Description Default
slide_image_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

required
slide_mask_urlpath str

url/path to slide mask (.tif)

required
stain_sample_factor float

downsample factor to use for stain vector estimation

required
stain_channel int

which channel of the deconvolved image to use for texture analysis

required
tile_size int

size of tiles to use (at the requested magnification) (500-1000 recommended)

required
output_urlpath str

output/working URL/path prefix

required
storage_options dict

storage options to pass to reading functions

required
output_storage_options dict

storage options to pass to writing functions

required

Returns:

Name Type Description
DataFrame

texture feature summary statistics

Source code in src/luna/pathology/cli/extract_stain_texture.py
def extract_stain_texture(
    slide_image_urlpath: str,
    slide_mask_urlpath: str,
    stain_sample_factor: float,
    stain_channel: int,
    tile_size: int,
    output_urlpath: str,
    storage_options: dict,
    output_storage_options: dict,
):
    """Compute GLCM texture after automatically deconvolving the image into stain channels, using tile-based processing

    Runs summary statistics on the distribution of texture values.

    Saves the raw per-feature value vectors (.npy) to the output directory.

    Args:
        slide_image_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        slide_mask_urlpath (str): url/path to slide mask (.tif)
        stain_sample_factor (float): downsample factor to use for stain vector estimation
        stain_channel (int): which channel of the deconvolved image to use for texture analysis
        tile_size (int): size of tiles to use (at the requested magnification) (500-1000 recommended)
        output_urlpath (str): output/working URL/path prefix
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        pd.DataFrame: texture feature summary statistics
    """
    with open(slide_image_urlpath, "rb", **storage_options) as slide_file:
        slide = tiffslide.TiffSlide(slide_file)
        # oslide = openslide.OpenSlide(slide_image_urlpath)

        logger.info(f"Slide dimensions {slide.dimensions}")
        sample_arr = get_downscaled_thumbnail(slide, stain_sample_factor)

    slide_full_generator, slide_full_level = get_full_resolution_generator(
        slide_image_urlpath, tile_size=tile_size, storage_options=storage_options
    )

    mask_full_generator, mask_full_level = get_full_resolution_generator(
        slide_mask_urlpath, tile_size=tile_size, storage_options=storage_options
    )

    stain_vectors = get_stain_vectors_macenko(sample_arr)

    logger.info(f"Stain vectors={stain_vectors}")

    tile_x_count, tile_y_count = slide_full_generator.level_tiles[slide_full_level]
    logger.info("Tiles x %s, Tiles y %s", tile_x_count, tile_y_count)

    # populate address, coordinates
    address_raster = [
        address
        for address in itertools.product(range(tile_x_count), range(tile_y_count))
    ]
    logger.info("Number of tiles in raster: %s", len(address_raster))

    features = defaultdict(list)

    N_tiles = len(address_raster)
    for n_tile, address in tqdm(enumerate(address_raster), file=sys.stdout):
        mask_patch = np.array(mask_full_generator.get_tile(mask_full_level, address))

        if not np.count_nonzero(mask_patch) > 1:
            continue

        image_patch = np.array(slide_full_generator.get_tile(slide_full_level, address))

        texture_values = extract_patch_texture_features(
            image_patch,
            mask_patch,
            stain_vectors,
            stain_channel,
            plot=False,
        )

        if texture_values is not None:
            for key, values in texture_values.items():
                features[key].append(values)
        logger.info(f"Processed Tile [{n_tile} / {N_tiles}] at {address}")
    for key, values in features.items():
        features[key] = np.concatenate(values).flatten()
    logger.debug(features)

    hist_features = {}
    fs, output_urlpath_prefix = fsspec.core.url_to_fs(
        output_urlpath, **output_storage_options
    )
    for key, values in features.items():
        output_path = Path(output_urlpath_prefix) / f"feature_vector_{key}.npy"
        with fs.open(output_path, "wb") as of:
            np.save(of, values)

        if not len(values) > 0:
            continue

        n, (smin, smax), sm, sv, ss, sk = scipy.stats.describe(values)

        if np.min(values) > 0:
            ln_params = scipy.stats.lognorm.fit(values, floc=0)
        else:
            ln_params = (0, 0, 0)

        fx_name_prefix = f"{key}_channel_{stain_channel}"
        hist_features.update(
            {
                f"{fx_name_prefix}_nobs": n,
                f"{fx_name_prefix}_min": smin,
                f"{fx_name_prefix}_max": smax,
                f"{fx_name_prefix}_mean": sm,
                f"{fx_name_prefix}_variance": sv,
                f"{fx_name_prefix}_skewness": ss,
                f"{fx_name_prefix}_kurtosis": sk,
                f"{fx_name_prefix}_lognorm_fit_p0": ln_params[0],
                f"{fx_name_prefix}_lognorm_fit_p2": ln_params[2],
            }
        )

    # The fit may fail sometimes, replace inf with 0
    df_result = (
        pd.DataFrame(data=hist_features, index=[0])
        .replace([np.inf, -np.inf], 0.0)
        .astype(float)
    )
    logger.info(df_result)

    return df_result

extract_tile_shape_features

__extract_tile_shape_features(objects_urlpath, tiles_urlpath, slide_urlpath, output_urlpath, resize_factor=16, detection_probability_threshold=None, slide_id='', statistical_descriptors=StatisticalDescriptors.ALL, cellular_features=CellularFeatures.ALL, property_type=PropertyType.ALL, include_smaller_regions=False, label_cols=None, storage_options={}, output_storage_options={}, properties=['area', 'convex_area', 'eccentricity', 'equivalent_diameter', 'euler_number', 'extent', 'label', 'major_axis_length', 'minor_axis_length', 'perimeter', 'solidity'])

Extracts shape and spatial features (HIF features) from a slide mask.

Args:

objects_urlpath (str): URL/path to object file (geopandas supported formats)
tiles_urlpath (str): URL/path to tiles manifest (parquet)
slide_urlpath (str): URL/path to slide (tiffslide supported formats)
output_urlpath (str): output URL/path
resize_factor (int): factor to downsample slide image
detection_probability_threshold (Optional[float]): detection probability threshold
slide_id (str): slide ID to add to dataframes
statistical_descriptors (StatisticalDescriptors): statistical descriptors to calculate
cellular_features (CellularFeatures): cellular features to include
property_type (PropertyType): properties to include
include_smaller_regions (bool): include smaller regions
label_cols (List[str]): list of score columns to use for the classification; a tile is classified as the column with the max score
storage_options (dict): storage options to pass to reading functions
output_storage_options (dict): storage options to pass to writing functions
properties (List[str]): list of whole slide image properties to extract; needs to be parquet compatible (numeric)

Returns:

dict: output paths and the number of features generated

Source code in src/luna/pathology/cli/extract_tile_shape_features.py
def __extract_tile_shape_features(
    objects_urlpath: str,
    tiles_urlpath: str,
    slide_urlpath: str,
    output_urlpath: str,
    resize_factor: int = 16,
    detection_probability_threshold: Optional[float] = None,
    slide_id: str = "",
    statistical_descriptors: StatisticalDescriptors = StatisticalDescriptors.ALL,
    cellular_features: CellularFeatures = CellularFeatures.ALL,
    property_type: PropertyType = PropertyType.ALL,
    include_smaller_regions: bool = False,
    label_cols: Optional[List[str]] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    properties: List[str] = [
        "area",
        "convex_area",
        "eccentricity",
        "equivalent_diameter",
        "euler_number",
        "extent",
        "label",
        "major_axis_length",
        "minor_axis_length",
        "perimeter",
        "solidity",
    ],
):
    """Extracts shape and spatial features (HIF features) from a slide mask.

     Args:
        objects_urlpath (str): URL/path to object file (geopandas supported formats)
        tiles_urlpath (str): URL/path to tiles manifest (parquet)
        slide_urlpath (str): URL/path to slide (tiffslide supported formats)
        output_urlpath (str): output URL/path
        resize_factor (int): factor to downsample slide image
        detection_probability_threshold (Optional[float]): detection
            probability threshold
        slide_id (str): Slide ID to add to dataframes
        statistical_descriptors (StatisticalDescriptors): statistical descriptors to calculate
        cellular_features (CellularFeatures): cellular features to include
        property_type (PropertyType): properties to include
        include_smaller_regions (bool): include smaller regions
        label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        properties (List[str]): list of whole slide image properties to
            extract. Needs to be parquet compatible (numeric).
    Returns:
        dict: output paths and the number of features generated
    """

    ofs, path = fsspec.core.url_to_fs(
        output_urlpath,
        **output_storage_options,
    )

    output_fpath = Path(path) / "shape_features.parquet"

    if ofs.exists(str(output_fpath)):
        logger.info(
            f"Output file already exist: {ofs.unstrip_protocol(str(output_fpath))}"
        )
        return {}

    with open(tiles_urlpath, **storage_options) as of:
        tiles_df = pd.read_parquet(of)

    with open(objects_urlpath, **storage_options) as of:
        object_gdf = gpd.read_file(of)

    with open(slide_urlpath, **storage_options) as of:
        slide = tiffslide.TiffSlide(of)
        slide_width = slide.dimensions[0]
        slide_height = slide.dimensions[1]

    if label_cols:
        tiles_df["Classification"] = tiles_df[label_cols].idxmax(axis=1)
    LabeledTileSchema.validate(tiles_df.reset_index())

    tile_area = tiles_df.iloc[0].tile_size ** 2

    counts = tiles_df.Classification.value_counts()

    combis = itertools.combinations(counts.index, 2)
    joint_entropy = []
    for i, j in combis:
        ent = {}
        ent["Parent"] = "whole_region"
        ent["Class"] = i
        ent["variable"] = f"Joint Entropy to {j}"
        ent["value"] = entropy(counts[[i, j]], base=2)
        joint_entropy.append(ent)

    entropy_df = pd.DataFrame(joint_entropy)

    shannon_entropy = entropy(counts, base=2)
    entropy_df = pd.concat(
        [
            entropy_df,
            pd.DataFrame(
                [
                    {
                        "Parent": "whole_region",
                        "Class": "All",
                        "variable": "Entropy",
                        "value": shannon_entropy,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )

    slide_area = counts * tile_area
    slide_area.index.name = "Parent"

    mask, mask_values = convert_tiles_to_mask(
        tiles_df, slide_width, slide_height, "Classification"
    )

    resized_mask = resize_array(mask, resize_factor)
    shape_features_df = extract_shape_features(
        resized_mask, mask_values, include_smaller_regions, properties
    )

    ann_region_polygons = [
        box(
            row.x_coord,
            row.y_coord,
            row.x_coord + row.xy_extent,
            row.y_coord + row.xy_extent,
        )
        for _, row in tiles_df.iterrows()
    ]
    tiles_gdf = gpd.GeoDataFrame(
        data=tiles_df, geometry=ann_region_polygons, crs="EPSG:4326"
    )

    logger.info("Spatially joining tiles and objects")
    gdf = object_gdf.sjoin(tiles_gdf, how="inner", predicate="within")
    if len(gdf) == 0:
        logger.info("No objects found within tiles")
        return None
    try:
        measurement_keys = list(gdf.measurements.iloc[0].keys())
        gdf = gdf.join(gdf.measurements.apply(lambda x: pd.Series(x)))
    except Exception:
        measurements = gdf.measurements.apply(
            lambda x: pd.DataFrame(json.loads(x)).set_index("name").squeeze()
        )
        measurement_keys = list(measurements.columns.values)
        gdf = gdf.join(measurements)
    gdf = gdf.join(gdf.classification.apply(lambda x: pd.Series(x)))
    gdf = gdf.rename(columns={"name": "Class", "Classification": "Parent"})

    gdf.Parent = gdf.Parent.astype("category")
    gdf.Class = gdf.Class.astype("category")

    if detection_probability_threshold:
        gdf = gdf.query(f"`Detection probability` > {detection_probability_threshold}")

    agg_keys = measurement_keys.copy()
    agg_keys.remove("Detection probability")
    logger.info("Calculating object measurement statistics")
    gb = gdf.groupby(by=["Parent", "Class"])[agg_keys]
    agg_funs = STATISTICAL_DESCRIPTOR_MAP[statistical_descriptors]
    agg_df = gb.agg(agg_funs)
    agg_df.columns = [" ".join(col).strip() for col in agg_df.columns.values]

    cell_density = None
    if "Cell: Area µm^2 sum" in agg_df.columns:
        cell_density = agg_df["Cell: Area µm^2 sum"] / (slide_area / 4)

    if cellular_features != CellularFeatures.ALL:
        agg_df = agg_df.filter(regex=cellular_features)

    if property_type != PropertyType.ALL:
        property_types = PROPERTY_TYPE_MAP[property_type]
        agg_df = agg_df.filter(regex="|".join(property_types))

    agg_df["Object Counts"] = gb.size()
    agg_df["Normalized Cell Density"] = agg_df["Object Counts"] / slide_area
    if cell_density is not None:
        agg_df["Cell Density"] = cell_density

    logger.info(
        "Calculating obj count log ratios between all tile label obj classification groups"
    )
    count_col = agg_df.columns.get_loc("Object Counts")
    idx0, idx1 = np.triu_indices(len(agg_df), 1)
    np.seterr(divide="ignore")
    ratio_df = pd.DataFrame(
        data={
            "variable": np.array(
                [
                    "Object Count Log Ratio to " + " ".join(row).strip()
                    for row in agg_df.index.values
                ]
            )[idx1],
            "value": np.log(agg_df.iloc[idx0, count_col].values)
            - np.log(agg_df.iloc[idx1, count_col].values),
        },
        index=agg_df.index[idx0],
    )

    mdf = pd.melt(agg_df.reset_index(), id_vars=["Parent", "Class"]).dropna()
    mdf = pd.concat([mdf, ratio_df.reset_index(), shape_features_df, entropy_df])

    if slide_id:
        mdf.insert(loc=0, column="slide_id", value=slide_id)

    mdf[["Parent", "Class", "variable"]] = mdf[["Parent", "Class", "variable"]].replace(
        r"_", " ", regex=True
    )

    with ofs.open(output_fpath, "wb") as of:
        mdf.to_parquet(of)

    props = {
        "shape_features_url": ofs.unstrip_protocol(str(output_fpath)),
        "num_features": len(mdf),
    }

    logger.info(props)

    return props

cli(slide_urlpath='???', object_urlpath='???', tiles_urlpath='???', output_urlpath='.', resize_factor=16, detection_probability_threshold=None, statistical_descriptors=StatisticalDescriptors.ALL, cellular_features=CellularFeatures.ALL, property_type=PropertyType.ALL, include_smaller_regions=False, label_cols=None, storage_options={}, output_storage_options={}, local_config='')

Extracts shape and spatial features (HIF features) from a slide mask.

Args:

    slide_urlpath (str): URL/path to slide (tiffslide supported formats)
    object_urlpath (str): URL/path to object file (geopandas supported formats)
    tiles_urlpath (str): URL/path to tiles manifest (parquet)
    output_urlpath (str): URL/path to output parquet file
    resize_factor (int): factor to downsample slide image
    detection_probability_threshold (Optional[float]): detection probability threshold
    statistical_descriptors (str): statistical descriptors to calculate. One of All, Quantiles, Stats, or Density
    cellular_features (str): cellular features to include. One of All, Nucleus, Cell, Cytoplasm, and Membrane
    property_type (str): properties to include. One of All, Geometric, or Stain
    include_smaller_regions (bool): include smaller regions in output
    label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score
    storage_options (dict): storage options to pass to reading functions
    output_storage_options (dict): storage options to pass to writing functions
    local_config (str): local config yaml file

Returns:

    dict: output paths and the number of features generated

Source code in src/luna/pathology/cli/extract_tile_shape_features.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    object_urlpath: str = "???",
    tiles_urlpath: str = "???",
    output_urlpath: str = ".",
    resize_factor: int = 16,
    detection_probability_threshold: Optional[float] = None,
    statistical_descriptors: str = StatisticalDescriptors.ALL,
    cellular_features: str = CellularFeatures.ALL,
    property_type: str = PropertyType.ALL,
    include_smaller_regions: bool = False,
    label_cols: List[str] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Extracts shape and spatial features (HIF features) from a slide mask.

     Args:
        slide_urlpath (str): URL/path to slide (tiffslide supported formats)
        object_urlpath (str): URL/path to object file (geopandas supported formats)
        tiles_urlpath (str): URL/path to tiles manifest (parquet)
        output_urlpath (str): URL/path to output parquet file
        resize_factor (int): factor to downsample slide image
        detection_probability_threshold (Optional[float]): detection probability threshold
        statistical_descriptors (str): statistical descriptors to calculate. One of All, Quantiles, Stats, or Density
        cellular_features (str): cellular features to include. One of All, Nucleus, Cell, Cytoplasm, and Membrane
        property_type (str): properties to include. One of All, Geometric, or Stain
        include_smaller_regions (bool): include smaller regions in output
        label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): local config yaml file

    Returns:
        dict: output paths and the number of features generated
    """
    config = get_config(vars())

    slide_id = Path(config["slide_urlpath"]).stem

    statistical_descriptors = config["statistical_descriptors"].capitalize()
    cellular_features = config["cellular_features"].capitalize()
    property_type = config["property_type"].capitalize()

    properties = __extract_tile_shape_features(
        config["object_urlpath"],
        config["tiles_urlpath"],
        config["slide_urlpath"],
        config["output_urlpath"],
        config["resize_factor"],
        config["detection_probability_threshold"],
        slide_id,
        statistical_descriptors,
        cellular_features,
        property_type,
        config["include_smaller_regions"],
        config["label_cols"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return properties
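
For reference, a minimal sketch of calling this CLI entrypoint from Python — the file paths below are hypothetical, and any fsspec-compatible URL works with matching storage_options:

from luna.pathology.cli.extract_tile_shape_features import cli

# hypothetical local paths; outputs land under output_urlpath
props = cli(
    slide_urlpath="./slides/10001.svs",
    object_urlpath="./objects/10001.geojson",
    tiles_urlpath="./tiles/10001.tiles.parquet",
    output_urlpath="./features/10001/",
)
print(props["shape_features_url"], props["num_features"])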

extract_tile_shape_features(slide_manifest, output_urlpath, resize_factor=16, detection_probability_threshold=None, statistical_descriptors=StatisticalDescriptors.ALL, cellular_features=CellularFeatures.ALL, property_type=PropertyType.ALL, include_smaller_regions=False, label_cols=None, storage_options={}, output_storage_options={}, objects_column='stardist_geojson_url', annotation_column='tile_shape_features_url', properties=['area', 'convex_area', 'eccentricity', 'equivalent_diameter', 'euler_number', 'extent', 'label', 'major_axis_length', 'minor_axis_length', 'perimeter', 'solidity'])

Extracts shape and spatial features (HIF features) from a slide mask.

Args:

    slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
    output_urlpath (str): output URL/path
    resize_factor (int): factor to downsample slide image
    detection_probability_threshold (Optional[float]): detection probability threshold
    statistical_descriptors (str): statistical descriptors to calculate. One of All, Quantiles, Stats, or Density
    cellular_features (str): cellular features to include. One of All, Nucleus, Cell, Cytoplasm, and Membrane
    property_type (str): properties to include. One of All, Geometric, or Stain
    include_smaller_regions (bool): include smaller regions in output
    label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score
    storage_options (dict): storage options to pass to reading functions
    output_storage_options (dict): storage options to pass to writing functions
    local_config (str): local config yaml file
    objects_column (str): slide manifest column name with stardist geoJSON URLs
    annotation_column (str): column to add to slide manifest with url to extracted features
    properties (List[str]): properties to extract

Returns:

    DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/extract_tile_shape_features.py
def extract_tile_shape_features(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    resize_factor: int = 16,
    detection_probability_threshold: Optional[float] = None,
    statistical_descriptors: StatisticalDescriptors = StatisticalDescriptors.ALL,
    cellular_features: CellularFeatures = CellularFeatures.ALL,
    property_type: PropertyType = PropertyType.ALL,
    include_smaller_regions: bool = False,
    label_cols: List[str] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    objects_column="stardist_geojson_url",
    annotation_column="tile_shape_features_url",
    properties: List[str] = [
        "area",
        "convex_area",
        "eccentricity",
        "equivalent_diameter",
        "euler_number",
        "extent",
        "label",
        "major_axis_length",
        "minor_axis_length",
        "perimeter",
        "solidity",
    ],
):
    """Extracts shape and spatial features (HIF features) from a slide mask.

     Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (str): output URL/path
        resize_factor (int): factor to downsample slide image
        detection_probability_threshold (Optional[float]): detection probability threshold
        statistical_descriptors (str): statistical descriptors to calculate. One of All, Quantiles, Stats, or Density
        cellular_features (str): cellular features to include. One of All, Nucleus, Cell, Cytoplasm, and Membrane
        property_type (str): properties to include. One of All, Geometric, or Stain
        include_smaller_regions (bool): include smaller regions in output
        label_cols (List[str]): list of score columns to use for the classification. Tile is classified as the column with the max score
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): local config yaml file
        objects_column (str): slide manifest column name with stardist geoJSON URLs
        annotation_column (str): column to add to slide manifest with url to extracted features
        properties (List[str]): properties to extract

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    client = get_or_create_dask_client()

    futures = []
    for _, row in slide_manifest.iterrows():
        future = client.submit(
            __extract_tile_shape_features,
            row[objects_column],
            row["tiles_url"],
            row["url"],
            output_urlpath,
            resize_factor,
            detection_probability_threshold,
            row["id"],
            statistical_descriptors,
            cellular_features,
            property_type,
            include_smaller_regions,
            label_cols,
            storage_options,
            output_storage_options,
            properties,
        )
        futures.append(future)

    progress(futures)
    results = client.gather(futures)

    return slide_manifest.assign(
        **{annotation_column: [x["shape_features_url"] for x in results]}
    )
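
A minimal sketch of the manifest-level API, assuming a dask client is configured and a slide_manifest produced by slide_etl whose rows carry the stardist_geojson_url and tiles_url columns (the manifest and paths below are hypothetical):

from luna.pathology.cli.extract_tile_shape_features import extract_tile_shape_features

# slide_manifest: DataFrame[SlideSchema] from slide_etl (hypothetical)
slide_manifest = extract_tile_shape_features(
    slide_manifest,
    output_urlpath="./features/",
)
# each row gains a tile_shape_features_url column pointing at its parquet output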

extract_tile_statistics

cli(tiles_urlpath='???', output_urlpath='???', storage_options={}, output_storage_options={}, local_config='')

Extracts statistics over tiles

Parameters:

    tiles_urlpath (str, default '???'): Tiles parquet file for slide(s). Absolute or relative filepath. Prefix with protocol to read from alternative filesystems
    output_urlpath (str, default '???'): Output prefix. Absolute or relative filepath. Prefix with protocol to write to alternative filesystems
    storage_options (dict, default {}): extra options that make sense for reading from a particular storage connection
    output_storage_options (dict, default {}): extra options that make sense for writing to a particular storage connection
    local_config (str, default ''): local config yaml file
Source code in src/luna/pathology/cli/extract_tile_statistics.py
@timed
@save_metadata
def cli(
    tiles_urlpath: str = "???",
    output_urlpath: str = "???",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Extracts statistics over tiles

    Args:
        tiles_urlpath (str): Tiles parquet file for slide(s). Absolute or relative filepath. Prefix with protocol to read from alternative filesystems
        output_urlpath (str): Output prefix. Absolute or relative filepath. Prefix with protocol to write to alternative filesystems
        storage_options (dict): extra options that make sense for reading from a particular storage connection
        output_storage_options (dict): extra options that make sense for writing to a particular storage connection
        local_config (str): local config yaml file

    """
    config = get_config(vars())

    df_feature_data = extract_tile_statistics(
        config["tiles_urlpath"],
        config["storage_options"],
    )

    fs, output_path_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )

    o = urlparse(config["tiles_urlpath"])
    id = Path(o.path).stem

    output_feature_file = Path(output_path_prefix) / f"{id}_tile_stats.parquet"

    logger.info(df_feature_data)
    with fs.open(output_feature_file, "wb") as f:
        df_feature_data.to_parquet(f)

    properties = {"feature_data": str(output_feature_file)}

    return properties

extract_tile_statistics(tiles_urlpath, storage_options)

Extracts statistics over tiles

Parameters:

    tiles_urlpath (str, required): Tiles parquet file for slide(s). Absolute or relative filepath. Prefix with protocol to read from alternative filesystems
    storage_options (dict, required): extra options that make sense for reading from a particular storage connection

Returns:

    pd.DataFrame: one row of statistics computed over the tile columns

Source code in src/luna/pathology/cli/extract_tile_statistics.py
def extract_tile_statistics(
    tiles_urlpath: str,
    storage_options: dict,
):
    """Extracts statistics over tiles

    Args:
        tiles_urlpath (str): Tiles parquet file for slide(s). Absolute or relative filepath. Prefix with protocol to read from alternative filesystems
        storage_options (dict): extra options that make sense for reading from a particular storage connection

    Returns:
        pd.DataFrame: one row of statistics computed over the tile columns
    """

    df = (
        pd.read_parquet(tiles_urlpath, storage_options=storage_options)
        .reset_index()
        .set_index("address")
        .drop(
            columns=["x_coord", "y_coord", "tile_size", "xy_extent", "tile_units"],
            errors="ignore",
        )
    )
    print(df.columns)

    dict_feature_data = {}

    for col in df.columns:
        dict_feature_data.update(
            luna.common.stats.compute_stats_1d(pd.to_numeric(df[col]), col)
        )

    df_feature_data = pd.DataFrame([dict_feature_data])

    return df_feature_data
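
A short sketch of calling extract_tile_statistics directly; the tiles path below is hypothetical:

from luna.pathology.cli.extract_tile_statistics import extract_tile_statistics

df_stats = extract_tile_statistics(
    "./tiles/10001.tiles.parquet",  # hypothetical tiles manifest
    storage_options={},
)
# one row of compute_stats_1d() descriptors per numeric tile column
print(df_stats.T)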

generate_mask

cli(slide_urlpath='???', roi_urlpath='???', output_urlpath='???', annotation_name='???', storage_options={}, output_storage_options={}, local_config='')

Generate a full resolution mask image (.tif) from vector annotations (polygons, shapes)

Inputs:
    input_slide_image: slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
    input_slide_roi: roi containing vector shapes (*.annotations, *.json)
Outputs:
    slide_mask
Example:
    generate_mask ./slides/10001.svs ./halo/10001.job18484.annotations
        -an Tumor
        -o ./masks/10001/

Source code in src/luna/pathology/cli/generate_mask.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    roi_urlpath: str = "???",
    output_urlpath: str = "???",
    annotation_name: str = "???",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Generate a full resolution mask image (.tif) from vector annotations (polygons, shapes)

    \b
    Inputs:
        input_slide_image: slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        input_slide_roi: roi containing vector shapes (*.annotations, *.json)
    \b
    Outputs:
        slide_mask
    \b
    Example:
        generate_mask ./slides/10001.svs ./halo/10001.job18484.annotations
            -an Tumor
            -o ./masks/10001/
    """
    config = get_config(vars())
    df = generate_mask(
        config["slide_urlpath"],
        config["roi_urlpath"],
        config["output_urlpath"],
        config["annotation_name"],
        config["storage_options"],
        config["output_storage_options"],
    )

    fs, output_urlpath_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )
    output_filename = Path(output_urlpath_prefix) / "mask_data.parquet"
    with fs.open(output_filename, "wb") as of:
        df.to_parquet(of)

    slide_id = Path(config["roi_urlpath"]).stem
    properties = {
        "slide_mask": Path(output_urlpath_prefix) / "mask_full_res.tif",
        "feature_data": output_filename,
        "mask_size": df["mask_size"].tolist(),
        "segment_keys": {"slide_id": slide_id},
    }

    return properties

generate_mask(slide_urlpath, roi_urlpath, output_urlpath, annotation_name, storage_options, output_storage_options)

Generate a full resolution mask image (.tif) from vector annotations (polygons, shapes)

Take into account positive and negative spaces. Essentially rasterizes a polygon file.

Parameters:

    slide_urlpath (str, required): slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...) absolute or relative path. prefix with scheme to use alternative file systems.
    roi_urlpath (str, required): halo or other polygonal annotation file (.xml, .geojson) absolute or relative path. prefix with scheme to use alternative file systems.
    output_urlpath (str, required): output/working absolute or relative path. prefix with scheme to use alternative file systems.
    annotation_name (str, required): name of annotation layer to use
    storage_options (dict, required): storage options that make sense for the file storage used

Returns:

    DataFrame: mask properties

Source code in src/luna/pathology/cli/generate_mask.py
@local_cache_urlpath(
    dir_key_write_mode={
        "output_urlpath": "w",
    }
)
def generate_mask(
    slide_urlpath: str,
    roi_urlpath: str,
    output_urlpath: str,
    annotation_name: str,
    storage_options: dict,
    output_storage_options: dict,
):
    """Generate a full resolution mask image (.tif) from vector annotations (polygons, shapes)

    Take into account positive and negative spaces.  Essentially rasterizes a polygon file.

    Args:
        slide_urlpath (str): slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...) absolute or relative path. prefix with scheme to use alternative file systems.
        roi_urlpath (str):  halo or other polygonal annotation file (.xml, .geojson) absolute or relative path. prefix with scheme to use alternative file systems.
        output_urlpath (str): output/working absolute or relative path. prefix with scheme to use alternative file systems.
        annotation_name (str): name of annotation layer to use
        storage_options (dict): storage options that make sense for the file storage used

    Returns:
        DataFrame: mask properties
    """
    mask_properties = {}

    with open(slide_urlpath, **storage_options) as of:
        slide = tiffslide.TiffSlide(of)
        thumbnail = slide.get_thumbnail((1000, 1000))

    with open(Path(output_urlpath) / "slide_thumbnail.png", "wb") as of:
        thumbnail.save(of, format="PNG")

    wsi_shape = (
        slide.dimensions[1],
        slide.dimensions[0],
    )  # Annotation file has flipped dimensions w.r.t openslide conventions
    logger.info(f"Slide shape={wsi_shape}")

    layer_names = get_layer_names(roi_urlpath, storage_options)
    logger.info(f"Available layer names={layer_names}")

    mask_properties["layer_names"] = list(layer_names)
    mask_properties["mask_size"] = list(wsi_shape)

    mask_arr, xml_region_properties = convert_xml_to_mask(
        roi_urlpath, wsi_shape, annotation_name, storage_options=storage_options
    )

    mask_properties.update(xml_region_properties)

    logger.info(f"Generating mask thumbnail, mask size={mask_arr.shape}")
    mask_thumbnail = openslide.ImageSlide(
        Image.fromarray(
            255 * block_reduce(mask_arr, block_size=(10, 10), func=np.mean, cval=0.0)
        )
    ).get_thumbnail((1000, 1000))

    with open(Path(output_urlpath) / "mask_thumbnail.png", "wb") as of:
        mask_thumbnail.save(of, format="PNG")

    slide_mask_file = Path(output_urlpath) / "mask_full_res.tif"
    with open(slide_mask_file, "wb") as of:
        tifffile.imwrite(of, mask_arr)

    return pd.DataFrame(mask_properties)
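
The Python equivalent of the CLI example above; the paths and layer name are illustrative:

from luna.pathology.cli.generate_mask import generate_mask

df_mask_props = generate_mask(
    slide_urlpath="./slides/10001.svs",
    roi_urlpath="./halo/10001.job18484.annotations",
    output_urlpath="./masks/10001/",
    annotation_name="Tumor",
    storage_options={},
    output_storage_options={},
)
# mask_full_res.tif plus slide/mask thumbnails are written under output_urlpath;
# the returned DataFrame holds the mask properties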

generate_tile_labels

cli(annotation_urlpath='???', tiles_urlpath='???', slide_id='???', output_urlpath='???', storage_options={}, output_storage_options={}, local_config='')

Queries the dataset at input_slide_annotation_dataset for a slide_id matching input_slide_tiles

Adds regional_label, intersection_area columns to slide tiles, where the former is the annotation label, and the latter the fraction of intersecting area between the tile and annotation regions

Parameters:

    annotation_urlpath (str, default '???'): url/path to parquet annotation dataset
    tiles_urlpath (str, default '???'): url/path to a slide-tile manifest file (.tiles.parquet)
    slide_id (str, default '???'): slide ID
    output_urlpath (str, default '???'): output url/path prefix
    storage_options (dict, default {}): options to pass to reading functions
    output_storage_options (dict, default {}): options to pass to writing functions
    local_config (str, default ''): url/path to local config YAML file

Returns:

    dict: metadata

Source code in src/luna/pathology/cli/generate_tile_labels.py
@timed
@save_metadata
def cli(
    annotation_urlpath: str = "???",
    tiles_urlpath: str = "???",
    slide_id: str = "???",
    output_urlpath: str = "???",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Queries the dataset at input_slide_annotation_dataset for a slide_id matching input_slide_tiles

    Adds regional_label, intersection_area columns to slide tiles, where the former is the annotation label, and the latter the fraction of intersecting area between the tile and annotation regions

    Args:
        annotation_urlpath (str): url/path to parquet annotation dataset
        tiles_urlpath (str): url/path to a slide-tile manifest file (.tiles.parquet)
        slide_id (str): slide ID
        output_urlpath (str): output url/path prefix
        storage_options (dict): options to pass to reading functions
        output_storage_options (dict): options to pass to writing functions
        local_config (str): url/path to local config YAML file
    Returns:
        dict: metadata
    """
    config = get_config(vars())

    df_tiles = generate_tile_labels(
        config["annotation_urlpath"],
        config["tiles_urlpath"],
        config["slide_id"],
        config["storage_options"],
    )

    fs, output_urlpath_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )
    output_header_file = (
        Path(output_urlpath_prefix)
        / f"{config['slide_id']}.regional_label.tiles.parquet"
    )
    with fs.open(output_header_file, "wb") as of:
        df_tiles.to_parquet(of)

    properties = {
        "slide_tiles": output_header_file,  # "Tiles" are the metadata that describe them
    }

    return properties

generate_tile_labels(annotation_urlpath, tiles_urlpath, slide_id, storage_options={})

Queries the dataset at input_slide_annotation_dataset for a slide_id matching input_slide_tiles

Adds regional_label, intersection_area columns to slide tiles, where the former is the annotation label, and the latter the fraction of intersecting area between the tile and annotation regions

Parameters:

    annotation_urlpath (str, required): url/path to parquet annotation dataset
    tiles_urlpath (str, required): url/path to a slide-tile manifest file (.tiles.parquet)
    slide_id (str, required): slide ID
    storage_options (dict, default {}): options to pass to reading functions

Returns:

    pd.DataFrame: tile dataframe with regional_label and intersection_area columns

Source code in src/luna/pathology/cli/generate_tile_labels.py
def generate_tile_labels(
    annotation_urlpath: str,
    tiles_urlpath: str,
    slide_id: str,
    storage_options: dict = {},
):
    """Queries the dataset at input_slide_annotation_dataset for a slide_id matching input_slide_tiles

    Adds regional_label, intersection_area columns to slide tiles, where the former is the annotation label, and the latter the fraction of intersecting area between the tile and annotation regions

    Args:
        annotation_urlpath (str): url/path to parquet annotation dataset
        tiles_urlpath (str): url/path to a slide-tile manifest file (.tiles.parquet)
        slide_id (str): slide ID
        storage_options (dict): options to pass to reading functions
    Returns:
        pd.DataFrame: tile dataframe with regional_label, and intersection_area columns
    """
    slide_id = str(slide_id)
    logger.info(f"slide_id={slide_id}")

    with open(annotation_urlpath, **storage_options) as of:
        df_annotation = pd.read_parquet(of)

    if slide_id not in df_annotation.index:
        raise RuntimeError("No matching annotations found for slide!")

    df_annotation = df_annotation.loc[[slide_id]].query("type=='geojson'")

    if not len(df_annotation):
        raise RuntimeError("No matching geojson annotations found!")

    slide_geojson, collection_name, annotation_name = (
        df_annotation.slide_geojson.item(),
        df_annotation.collection_name.item(),
        df_annotation.annotation_name.item(),
    )

    print(slide_geojson, collection_name, annotation_name)

    with open(slide_geojson) as f:
        features = json.load(f)["features"]

    d_collections = {}

    for feature in features:
        label = feature["properties"]["label"]

        if label not in d_collections.keys():
            d_collections[label] = []

        d_collections[label].append(shape(feature["geometry"]).buffer(0))

    for label in d_collections.keys():
        d_collections[label] = GeometryCollection(d_collections[label])

    with open(tiles_urlpath, **storage_options) as of:
        df_tiles = pd.read_parquet(of).reset_index().set_index("address")
    l_regional_labels = []
    l_intersection_areas = []

    for _, row in tqdm(df_tiles.iterrows(), total=len(df_tiles)):
        tile_x, tile_y, tile_extent = row.x_coord, row.y_coord, row.xy_extent

        tile_polygon = Polygon(
            [
                (tile_x, tile_y),
                (tile_x, tile_y + tile_extent),
                (tile_x + tile_extent, tile_y + tile_extent),
                (tile_x + tile_extent, tile_y),
            ]
        )

        tile_label = None
        max_overlap = 0.0
        for label in d_collections.keys():
            intersection_area = (
                d_collections[label].intersection(tile_polygon).area / tile_polygon.area
            )
            if intersection_area > max_overlap:
                tile_label, max_overlap = label, intersection_area

        l_regional_labels.append(tile_label)
        l_intersection_areas.append(max_overlap)

    df_tiles["regional_label"] = l_regional_labels
    df_tiles["intersection_area"] = l_intersection_areas

    logger.info(df_tiles.loc[df_tiles.intersection_area > 0])

    return df_tiles
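
A minimal sketch of labeling tiles from an annotation dataset; the dataset and manifest paths are hypothetical:

from luna.pathology.cli.generate_tile_labels import generate_tile_labels

df_tiles = generate_tile_labels(
    annotation_urlpath="./datasets/annotations.parquet",  # hypothetical
    tiles_urlpath="./tiles/10001.tiles.parquet",          # hypothetical
    slide_id="10001",
)
# tiles that overlap an annotation region
print(df_tiles.query("intersection_area > 0")[["regional_label", "intersection_area"]])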

generate_tile_mask

cli(slide_urlpath='???', tiles_urlpath='', label_cols='???', output_urlpath='.', storage_options={}, output_storage_options={})

Converts categorical tile labels to a slide image mask. This mask can be used for feature extraction and spatial analysis.

Args:

    slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
    tiles_urlpath (str): url/path to valid SlideTiles table
    label_cols (List[str]): list of label columns in the input_slide_tiles table to generate the mask with
    output_urlpath (str): output url/path prefix
    storage_options (dict): storage options to pass to reading functions
    output_storage_options (dict): storage options to pass to writing functions

Returns:

    dict: output properties

Source code in src/luna/pathology/cli/generate_tile_mask.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    tiles_urlpath: str = "",
    label_cols: List[str] = "???",  # type: ignore
    output_urlpath: str = ".",
    storage_options: dict = {},
    output_storage_options: dict = {},
):
    """Converts categorical tile labels to a slide image mask. This mask can be used for feature extraction and spatial analysis.

     Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        tiles_urlpath (str): url/path to valid SlideTiles table
        label_cols (List[str]): list of label columns in the input_slide_tiles table to generate the mask with
        output_urlpath (str): output url/path prefix
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        dict: output properties

    """
    config = get_config(vars())

    logger.info("Reading SlideTiles")
    with open(config["tiles_urlpath"], "rb", **config["storage_options"]) as of:
        tiles_df = pd.read_parquet(of).reset_index().set_index("address")

    with open(config["slide_urlpath"], **config["storage_options"]) as of:
        slide = tiffslide.TiffSlide(of)
        slide_width = slide.dimensions[0]
        slide_height = slide.dimensions[1]

    mask_arr, mask_values = convert_tiles_to_mask(
        tiles_df,
        slide_width,
        slide_height,
        config["label_cols"],
        config["output_urlpath"],
        config["output_storage_options"],
    )

    fs, output_path = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )

    slide_mask = Path(output_path) / "tile_mask.tif"
    properties = {
        "slide_mask": fs.unstrip_protocol(str(slide_mask)),
        "mask_values": mask_values,
        "mask_size": mask_arr.shape,
    }
    logger.info(properties)
    return properties

convert_tiles_to_mask(tiles_df, slide_width, slide_height, label_cols, output_urlpath='', output_storage_options={})

Converts categorical tile labels to a slide image mask. This mask can be used for feature extraction and spatial analysis.

Args:

    tiles_df (pd.DataFrame): tile manifest dataframe (a valid SlideTiles table)
    slide_width (int): slide width
    slide_height (int): slide height
    label_cols (Union[str, List[str]]): column with labels or list of label columns in the tiles_urlpath table to generate the mask with

Returns:

    np.ndarray, Dict[int, str]: image mask, mask value mapping

Source code in src/luna/pathology/cli/generate_tile_mask.py
@multimethod
@local_cache_urlpath(
    dir_key_write_mode={"output_urlpath": "w"},
)
def convert_tiles_to_mask(
    tiles_df: pd.DataFrame,
    slide_width: int,
    slide_height: int,
    label_cols: Union[str, List[str]],
    output_urlpath: str = "",
    output_storage_options: dict = {},
):
    """Converts categorical tile labels to a slide image mask. This mask can be used for feature extraction and spatial analysis.

     Args:
        tiles_df (pd.DataFrame): tile manifest dataframe (a valid SlideTiles table)
        slide_width (int): slide width
        slide_height (int): slide height
        label_cols (Union[str, List[str]]): column with labels or list of label columns in the tiles_urlpath table to generate the mask with

    Returns:
        np.ndarray, Dict[int, str]: image mask, mask value mapping

    """

    TileSchema.validate(tiles_df.reset_index())

    mask_arr = np.zeros((slide_height, slide_width), dtype=np.int8)

    if isinstance(label_cols, str):
        tiles_df["mask"] = tiles_df[label_cols].astype("category")
    else:
        tiles_df["mask"] = tiles_df[label_cols].idxmax(axis=1)
        tiles_df["mask"] = tiles_df["mask"].astype("category")
    # map each categorical label to a positive integer mask value
    mask_values = dict(zip(tiles_df["mask"], tiles_df["mask"].cat.codes + 1))

    logger.info(f"Mapping label column to mask values: {mask_values}")

    for address, row in tiles_df.iterrows():
        x, y, extent = int(row.x_coord), int(row.y_coord), int(row.xy_extent)

        value = mask_values[row["mask"]]

        # permuted rows and columns due to differences in indexing between openslide and skimage/numpy
        mask_arr[y : y + extent, x : x + extent] = value

        logger.info(f"{address}, {row['mask']}, {value}")

    if output_urlpath:
        slide_mask = Path(output_urlpath) / "tile_mask.tif"
        logger.info(f"Saving output mask to {slide_mask}")
        with open(slide_mask, "wb") as of:
            tifffile.imwrite(of, mask_arr)

    return mask_arr, mask_values
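
A sketch of calling convert_tiles_to_mask on an existing tile manifest; the file paths and the "Tumor"/"Stroma" score columns are hypothetical:

import pandas as pd
import tiffslide
from luna.pathology.cli.generate_tile_mask import convert_tiles_to_mask

# hypothetical inputs: a saved tile manifest and its source slide
tiles_df = (
    pd.read_parquet("./tiles/10001.tiles.parquet")
    .reset_index()
    .set_index("address")
)
with open("./slides/10001.svs", "rb") as f:
    slide = tiffslide.TiffSlide(f)
    slide_width, slide_height = slide.dimensions

# each tile is labeled by its argmax column across the listed label columns
mask_arr, mask_values = convert_tiles_to_mask(
    tiles_df, slide_width, slide_height, ["Tumor", "Stroma"]
)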

generate_tiles

__generate_tiles(slide_urlpath, tile_size, output_urlpath, force, requested_magnification=None, storage_options={}, output_storage_options={})

Rasterize a slide into smaller tiles

Tile addresses and arrays are saved as key-value pairs in (tiles.h5), and the corresponding manifest/header file (tiles.parquet) is also generated

Necessary data for the manifest file are: address, x_coord, y_coord, xy_extent, tile_size, tile_units

Parameters:

    slide_urlpath (str, required): slide url/path
    tile_size (int, required): size of tiles to use (at the requested magnification)
    requested_magnification (float, default None): Magnification scale at which to perform computation

Returns:

    dict: tile manifest properties (tiles_url, total_tiles)

Source code in src/luna/pathology/cli/generate_tiles.py
def __generate_tiles(
    slide_urlpath: str,
    tile_size: int,
    output_urlpath: str,
    force: bool,
    requested_magnification: Optional[int] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
) -> dict:
    """Rasterize a slide into smaller tiles

    Tile addresses and arrays are saved as key-value pairs in (tiles.h5),
    and the corresponding manifest/header file (tiles.parquet) is also generated

    Necessary data for the manifest file are:
    address, x_coord, y_coord, xy_extent, tile_size, tile_units

    Args:
        slide_urlpath (str): slide url/path
        tile_size (int): size of tiles to use (at the requested magnification)
        requested_magnification (float): Magnification scale at which to perform computation

    Returns:
        dict: tile manifest properties (tiles_url, total_tiles)
    """
    slide_id = Path(slide_urlpath).stem
    ofs, output_path = fsspec.core.url_to_fs(output_urlpath, **output_storage_options)
    output_file = str(Path(output_path) / f"{slide_id}.tiles.parquet")
    if not force and ofs.exists(output_file):
        logger.info("Output file exists: {ofs.unstrip_protocol(output_file)}")
        return

    with fsspec.open(slide_urlpath, "rb", **storage_options) as f:
        slide = TiffSlide(f)
        logger.info(f"Slide size = [{slide.dimensions[0]},{slide.dimensions[1]}]")

        to_mag_scale_factor = get_scale_factor_at_magnification(
            slide, requested_magnification=requested_magnification
        )

        if not to_mag_scale_factor % 1 == 0:
            logger.error(f"Bad magnfication scale factor = {to_mag_scale_factor}")
            raise ValueError(
                "You chose a combination of requested tile sizes and magnification that resulted in non-integer tile sizes at different scales"
            )

        full_resolution_tile_size = int(tile_size * to_mag_scale_factor)
        logger.info(
            f"Normalized magnification scale factor for {requested_magnification}x is {to_mag_scale_factor}",
        )
        logger.info(
            f"Requested tile size={tile_size}, tile size at full magnification={full_resolution_tile_size}"
        )

    # get DeepZoomGenerator, level
    full_generator, full_level = get_full_resolution_generator(
        slide_urlpath,
        tile_size=full_resolution_tile_size,
        storage_options=storage_options,
    )
    tile_x_count, tile_y_count = full_generator.level_tiles[full_level]
    logger.info(f"tiles x {tile_x_count}, tiles y {tile_y_count}")

    # populate address, coordinates
    tiles = DataFrame[TileSchema](
        [
            Tile(
                address=coord_to_address(address, requested_magnification),
                x_coord=(address[0]) * full_resolution_tile_size,
                y_coord=(address[1]) * full_resolution_tile_size,
                xy_extent=full_resolution_tile_size,
                tile_size=tile_size,
                tile_units="px",
            ).__dict__
            for address in itertools.product(
                range(1, tile_x_count - 1), range(1, tile_y_count - 1)
            )
        ]
    )

    logger.info(f"Number of tiles in raster: {len(tiles)}")
    #    logger.info("Creating lazy tiles")
    #    lazy_tiles = [
    #            [dask.delayed(get_tile_from_slide)(tiles_df(x, y),
    #                                               full_resolution_tile_size,
    #                                               tile_size,
    #                                               slide)
    #             for y in range(1, tile_y_count - 1)]
    #            for x in range(1, tile_x_count - 1)]
    #    sample = lazy_tiles[0][0].compute()
    #
    #    lazy_arrays = da.stack([
    #        da.stack([da.from_delayed(lazy_tile, dtype=sample.dtype, shape=sample.shape)
    #                        for lazy_tile in inner] )
    #        for inner in lazy_tiles
    #        ])
    #    logger.info(f"lazy tiles: {lazy_arrays.shape}")

    with ofs.open(output_file, mode="wb") as of:
        tiles.to_parquet(of)

    properties = {
        "tiles_url": ofs.unstrip_protocol(
            output_file
        ),  # "Tiles" are the metadata that describe them
        "total_tiles": len(tiles),
    }

    return properties
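
To make the magnification arithmetic concrete, a small illustration with hypothetical numbers (a 40x scan tiled at 10x, where get_scale_factor_at_magnification would return 4):

# hypothetical values: requested_magnification=10 on a 40x slide -> scale factor 4
tile_size = 244
to_mag_scale_factor = 4
full_resolution_tile_size = int(tile_size * to_mag_scale_factor)  # 976

# tile address (x=2, y=3) maps to this full-resolution pixel origin
x_coord = 2 * full_resolution_tile_size  # 1952
y_coord = 3 * full_resolution_tile_size  # 2928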

cli(slide_urlpath='???', tile_size='???', requested_magnification=None, storage_options={}, output_storage_options={}, dask_options={}, local_config='', output_urlpath='.', force=False)

Rasterize a slide into smaller tiles, saving tile metadata as rows in a csv file

Necessary data for the manifest file are: address, x_coord, y_coord, xy_extent, tile_size, tile_units

Inputs:
    input_slide_image: slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
Outputs:
    slide_tiles
Example:
    generate_tiles 10001.svs
        -rts 244 -rmg 10
        -o 10001/tiles

Source code in src/luna/pathology/cli/generate_tiles.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    tile_size: int = "???",  # type: ignore
    requested_magnification: Optional[int] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    dask_options: dict = {},
    local_config: str = "",
    output_urlpath: str = ".",
    force: bool = False,
) -> dict:
    """Rasterize a slide into smaller tiles, saving tile metadata as rows in a csv file

    Necessary data for the manifest file are:
    address, x_coord, y_coord, xy_extent, tile_size, tile_units

    \b
    Inputs:
        input_slide_image: slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
    Outputs:
        slide_tiles
    \b
    Example:
        generate_tiles 10001.svs
            -rts 244 -rmg 10
            -o 10001/tiles
    """
    config = get_config(vars())

    configure_dask_client(**config["dask_options"])

    properties = __generate_tiles(
        config["slide_urlpath"],
        config["tile_size"],
        config["output_urlpath"],
        config["force"],
        config["requested_magnification"],
        config["storage_options"],
        config["output_storage_options"],
    )

    return properties
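
The Python equivalent of the shell example above, with illustrative paths:

from luna.pathology.cli.generate_tiles import cli

props = cli(
    slide_urlpath="10001.svs",
    tile_size=244,
    requested_magnification=10,
    output_urlpath="10001/tiles",
)
print(props["tiles_url"], props["total_tiles"])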

infer_tile_labels

__infer_tile_labels(tiles_urlpath, slide_id, output_urlpath, force, torch_model_repo_or_dir, model_name, num_cores, batch_size, kwargs, use_gpu, insecure, storage_options, output_storage_options)

Run inference using a model and transform definition (either local or using torch.hub)

Decorates existing slide_tiles with additional columns corresponding to class prediction/scores from the model

Parameters:

    tiles_urlpath (str, required): path to a slide-tile manifest file (.tiles.parquet)
    slide_id (str, required): slide ID
    output_urlpath (str, required): output/working directory
    torch_model_repo_or_dir (str, required): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml)
    model_name (str, required): torch hub model name (a nn.Module at the repo repo_name)
    num_cores (int, required): Number of cores to use for CPU parallelization
    batch_size (int, required): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage)
    kwargs (dict, required): additional keywords to pass to model initialization
    use_gpu (bool, required): use GPU if available
    insecure (bool, required): insecure SSL
    storage_options (dict, required): storage options to pass to reading functions
    output_storage_options (dict, required): storage options to pass to writing functions

Returns:

    dict: metadata

Source code in src/luna/pathology/cli/infer_tile_labels.py
def __infer_tile_labels(
    tiles_urlpath: str,
    slide_id: str,
    output_urlpath: str,
    force: bool,
    torch_model_repo_or_dir: str,
    model_name: str,
    num_cores: int,
    batch_size: int,
    kwargs: dict,
    use_gpu: bool,
    insecure: bool,
    storage_options: dict,
    output_storage_options: dict,
):
    """Run inference using a model and transform definition (either local or using torch.hub)

    Decorates existing slide_tiles with additional columns corresponding to class prediction/scores from the model

    Args:
        tiles_urlpath (str): path to a slide-tile manifest file (.tiles.parquet)
        slide_id (str): slide ID
        output_urlpath (str): output/working directory
        torch_model_repo_or_dir (str): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml)
        model_name (str): torch hub model name (a nn.Module at the repo repo_name)
        num_cores (int): Number of cores to use for CPU parallelization
        batch_size (int): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage)
        kwargs (dict): additional keywords to pass to model initialization
        use_gpu (bool): use GPU if available
        insecure (bool): insecure SSL
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        dict: metadata
    """
    if insecure:
        ssl._create_default_https_context = ssl._create_unverified_context

    ofs, output_path_prefix = fsspec.core.url_to_fs(
        output_urlpath,
        **output_storage_options,
    )

    output_file = str(Path(output_path_prefix) / f"{slide_id}.tiles.parquet")

    if not force and ofs.exists(output_file):
        logger.info(f"outputs already exist: {output_file}")
        return

    tiles_df = (
        pd.read_parquet(tiles_urlpath, storage_options=storage_options)
        .reset_index()
        .set_index("address")
    )

    # Get our model and transforms and construct the Tile Dataset and Classifier
    if os.path.exists(torch_model_repo_or_dir):
        source = "local"
    else:
        source = "github"

    logger.info(f"Torch hub source = {source} @ {torch_model_repo_or_dir}")

    # if source == "github":
    # logger.info(f"Available models: {torch.hub.list(torch_model_repo_or_dir, trust_repo=False)}")

    ttm = torch.hub.load(
        torch_model_repo_or_dir,
        model_name,
        source=source,
        **kwargs,
        force_reload=True,
        trust_repo=True,
    )

    if not isinstance(ttm, TorchTransformModel):
        raise RuntimeError(f"Not a valid model, loaded model was of type {type(ttm)}")

    pin_memory = False
    if use_gpu and torch.cuda.is_available():
        pin_memory = True
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logger.info(f"Using device = {device}")

    preprocess = ttm.get_preprocess()
    transform = ttm.transform
    ttm.model.to(device)

    ds = HDF5Dataset(tiles_df, preprocess=preprocess, storage_options=storage_options)
    loader = DataLoader(
        ds, num_workers=num_cores, batch_size=batch_size, pin_memory=pin_memory
    )

    # Generate aggregate dataframe
    with torch.no_grad():
        df_scores = pd.concat(
            [
                pd.DataFrame(
                    post_transform_to_2d(transform(data.to(device))), index=index
                )
                for data, index in tqdm(loader, file=sys.stdout)
            ]
        )

    if hasattr(ttm, "column_labels"):
        logger.info(f"Mapping column labels -> {ttm.column_labels}")
        df_scores = df_scores.rename(columns=ttm.column_labels)

    df_output = tiles_df.join(df_scores)
    df_output.columns = df_output.columns.astype(str)
    df_output.index.name = "address"

    logger.info(df_output)

    with ofs.open(output_file, "wb") as of:
        df_output.to_parquet(of)

    # Save our properties and params
    properties = {
        "tiles_url": ofs.unstrip_protocol(output_file),
        "total_tiles": len(df_output),
        "available_labels": list(df_output.columns),
    }

    return properties

cli(slide_urlpath='', tiles_urlpath='', tile_size=None, filter_query='', requested_magnification=None, torch_model_repo_or_dir='???', model_name='???', num_cores=4, batch_size=8, output_urlpath='.', force=False, kwargs={}, use_gpu=False, dask_options={}, insecure=False, storage_options={}, output_storage_options={})

Run inference using a model and transform definition (either local or using torch.hub)

Decorates existing slide_tiles with additional columns corresponding to class prediction/scores from the model

Parameters:

    slide_urlpath (str, default ''): url/path to slide image (virtual slide formats compatible with TiffSlide, .svs, .tif, .scn, ...)
    tiles_urlpath (str, default ''): path to a slide-tile manifest file (.tiles.parquet)
    tile_size (Optional[int], default None): size of tiles to use (at the requested magnification)
    filter_query (str, default ''): pandas query by which to filter tiles based on their various tissue detection scores
    requested_magnification (Optional[int], default None): Magnification scale at which to perform computation
    torch_model_repo_or_dir (str, default '???'): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml)
    model_name (str, default '???'): torch hub model name (a nn.Module at the repo repo_name)
    num_cores (int, default 4): Number of cores to use for CPU parallelization
    batch_size (int, default 8): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage)
    output_urlpath (str, default '.'): output/working directory
    force (bool, default False): overwrite outputs if they exist
    kwargs (dict, default {}): additional keywords to pass to model initialization
    use_gpu (bool, default False): use GPU if available
    dask_options (dict, default {}): options to pass to dask client
    insecure (bool, default False): insecure SSL
    storage_options (dict, default {}): storage options to pass to reading functions
    output_storage_options (dict, default {}): storage options to pass to writing functions

Returns:

    dict: metadata

Source code in src/luna/pathology/cli/infer_tile_labels.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "",
    tiles_urlpath: str = "",
    tile_size: Optional[int] = None,
    filter_query: str = "",
    requested_magnification: Optional[int] = None,
    torch_model_repo_or_dir: str = "???",
    model_name: str = "???",
    num_cores: int = 4,
    batch_size: int = 8,
    output_urlpath: str = ".",
    force: bool = False,
    kwargs: dict = {},
    use_gpu: bool = False,
    dask_options: dict = {},
    insecure: bool = False,
    storage_options: dict = {},
    output_storage_options: dict = {},
):
    """Run inference using a model and transform definition (either local or using torch.hub)

    Decorates existing slide_tiles with additional columns corresponding to class prediction/scores from the model

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with TiffSlide, .svs, .tif, .scn, ...)
        tiles_urlpath (str): path to a slide-tile manifest file (.tiles.parquet)
        tile_size (Optional[int]): size of tiles to use (at the requested magnification)
        filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores
        requested_magnification (Optional[int]): Magnification scale at which to perform computation
        torch_model_repo_or_dir (str): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml)
        model_name (str): torch hub model name (a nn.Module at the repo repo_name)
        num_cores (int): Number of cores to use for CPU parallelization
        batch_size (int): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage)
        output_urlpath (str): output/working directory
        force (bool): overwrite outputs if they exist
        kwargs (dict): additional keywords to pass to model initialization
        use_gpu (bool): use GPU if available
        dask_options (dict): options to pass to dask client
        insecure (bool): insecure SSL
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        dict: metadata
    """
    config = get_config(vars())
    configure_dask_client(**config["dask_options"])

    if not config["slide_urlpath"] and not config["tiles_urlpath"]:
        raise fire.core.FireError("Specify either tiles_urlpath or slide_urlpath")

    if not config["tile_size"] and not config["tiles_urlpath"]:
        raise fire.core.FireError("Specify either tiles_urlpath or tile_size")

    if config["slide_urlpath"]:
        slide_id = Path(config["slide_urlpath"]).stem
    else:
        slide_id = Path(config["tiles_urlpath"]).stem.removesuffix(".tiles")

    tiles_urlpath = config["tiles_urlpath"]
    with make_temp_directory() as temp_dir:
        if not tiles_urlpath:
            tiles_result = __generate_tiles(
                config["slide_urlpath"],
                config["tile_size"],
                (Path(temp_dir) / "generate_tiles").as_uri(),
                config["force"],
                config["tile_magnification"],
                config["storage_options"],
            )
            detect_tissue_result = __detect_tissue(
                config["slide_urlpath"],
                tiles_result["tiles_url"],
                slide_id,
                config["thumbnail_magnification"],
                config["filter_query"],
                config["batch_size"],
                (Path(temp_dir) / "detect_tissue").as_uri(),
                config["force"],
                config["storage_options"],
            )
            save_tiles_result = _save_tiles(
                detect_tissue_result["tiles_urlpath"],
                config["slide_urlpath"],
                (Path(temp_dir) / "save_tiles").as_uri(),
                config["force"],
                config["batch_size"],
                config["storage_options"],
            )
            tiles_urlpath = save_tiles_result["tiles_url"]

        return __infer_tile_labels(
            tiles_urlpath,
            slide_id,
            config["output_urlpath"],
            config["force"],
            config["torch_model_repo_or_dir"],
            config["model_name"],
            config["num_cores"],
            config["batch_size"],
            config["kwargs"],
            config["use_gpu"],
            config["insecure"],
            config["storage_options"],
            config["output_storage_options"],
        )
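
A hedged sketch of calling this CLI entrypoint from Python; the torch.hub repository and model name below are illustrative placeholders, and the tile manifest is assumed to have been written by save_tiles:

from luna.pathology.cli.infer_tile_labels import cli

props = cli(
    tiles_urlpath="./tiles/10001.tiles.parquet",  # hypothetical saved tiles
    torch_model_repo_or_dir="msk-mind/luna-ml",   # or a local model directory
    model_name="some_tissue_classifier",          # hypothetical hub model name
    batch_size=64,
    output_urlpath="./scores/10001/",
)
print(props["available_labels"])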

infer_tile_labels(slide_manifest, tile_size=None, filter_query='', thumbnail_magnification=None, tile_magnification=None, torch_model_repo_or_dir='', model_name='', num_cores=1, batch_size=2000, output_urlpath='.', force=True, kwargs={}, use_gpu=False, insecure=False, storage_options={}, output_storage_options={})

Run inference using a model and transform definition (either local or using torch.hub)

Decorates existing tiles manifests with additional columns corresponding to class prediction/scores from the model

Parameters:

    slide_manifest (DataFrame, required): slide manifest from slide_etl
    tile_size (Optional[int], default None): size of tiles to use (at the requested magnification)
    filter_query (str, default ''): pandas query by which to filter tiles based on their various tissue detection scores
    thumbnail_magnification (Optional[int], default None): Magnification scale at which to detect tissue
    tile_magnification (Optional[int], default None): Magnification scale at which to generate tiles
    torch_model_repo_or_dir (str, default ''): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml)
    model_name (str, default ''): torch hub model name (a nn.Module at the repo repo_name)
    num_cores (int, default 1): Number of cores to use for CPU parallelization
    batch_size (int, default 2000): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage)
    output_urlpath (str, default '.'): output/working directory
    force (bool, default True): overwrite outputs if they exist
    kwargs (dict, default {}): additional keywords to pass to model initialization
    use_gpu (bool, default False): use GPU if available
    insecure (bool, default False): insecure SSL
    storage_options (dict, default {}): storage options to pass to reading functions
    output_storage_options (dict, default {}): storage options to pass to writing functions

Returns:

    DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/infer_tile_labels.py
def infer_tile_labels(
    slide_manifest: DataFrame[SlideSchema],
    tile_size: Optional[int] = None,
    filter_query: str = "",
    thumbnail_magnification: Optional[int] = None,
    tile_magnification: Optional[int] = None,
    torch_model_repo_or_dir: str = "",
    model_name: str = "",
    num_cores: int = 1,
    batch_size: int = 2000,
    output_urlpath: str = ".",
    force: bool = True,
    kwargs: dict = {},
    use_gpu: bool = False,
    insecure: bool = False,
    storage_options: dict = {},
    output_storage_options: dict = {},
) -> DataFrame[SlideSchema]:
    """Run inference using a model and transform definition (either local or using torch.hub)

    Decorates existing tiles manifests with additional columns corresponding to class prediction/scores from the model

    Args:
        slide_manifest (DataFrame): slide manifest from slide_etl
        tile_size (Optional[int]): size of tiles to use (at the requested magnification)
        filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores
        thumbnail_magnification (Optional[int]): Magnification scale at which to detect tissue
        tile_magnification (Optional[int]): Magnification scale at which to generate tiles
        torch_model_repo_or_dir (str): repository root name like (namespace/repo) at github.com to serve torch.hub models. Or path to a local model (e.g. msk-mind/luna-ml)
        model_name (str): torch hub model name (a nn.Module at the repo repo_name)
        num_cores (int): Number of cores to use for CPU parallelization
        batch_size (int): size in batch dimension to chunk inference (8-256 recommended, depending on memory usage)
        output_urlpath (str): output/working directory
        force (bool): overwrite outputs if they exist
        kwargs (dict): additional keywords to pass to model initialization
        use_gpu (bool): use GPU if available
        insecure (bool): insecure SSL
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        pd.DataFrame: slide manifest
    """
    client = get_or_create_dask_client()

    if "tiles_url" not in slide_manifest.columns:
        if tile_size is None:
            raise RuntimeError("Need to have generated tiles or specify tile_size")
        # generate tiles
        slide_manifest = detect_tissue(
            slide_manifest,
            None,
            tile_size=tile_size,
            thumbnail_magnification=thumbnail_magnification,
            tile_magnification=tile_magnification,
            filter_query=filter_query,
            batch_size=batch_size,
            storage_options=storage_options,
            output_urlpath=output_urlpath,
            force=force,
            output_storage_options=output_storage_options,
        )

        slide_manifest = save_tiles(
            slide_manifest,
            output_urlpath,
            force,
            batch_size,
            storage_options,
            output_storage_options,
        )

    futures = []
    for row in slide_manifest.itertuples(name="Slide"):
        future = client.submit(
            __infer_tile_labels,
            row.tiles_url,
            row.id,
            output_urlpath,
            force,
            torch_model_repo_or_dir,
            model_name,
            num_cores,
            batch_size,
            kwargs,
            use_gpu,
            insecure,
            storage_options,
            output_storage_options,
        )
        futures.append(future)

    progress(futures)
    results = client.gather(futures)
    return slide_manifest.assign(tiles_url=[x["tiles_url"] for x in results])
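A minimal end-to-end sketch (the slide path, model repository, and model name below are hypothetical, and the filter column assumes tissue-detection scores have been computed):

from luna.pathology.cli.infer_tile_labels import infer_tile_labels
from luna.pathology.cli.slide_etl import slide_etl

# build a slide manifest, then decorate its tiles with model predictions
slide_manifest = slide_etl(["/data/slides/123.svs"], project_name="PRO-12-123", no_copy=True)
slide_manifest = infer_tile_labels(
    slide_manifest,
    tile_size=256,
    filter_query="purple_score > 0.1",           # hypothetical score column
    torch_model_repo_or_dir="msk-mind/luna-ml",  # repo example from the docstring
    model_name="TissueTileNet",                  # hypothetical nn.Module name
    batch_size=64,
    output_urlpath="/results/tiles",
)
print(slide_manifest["tiles_url"])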

merge_shape_features

cli(shape_features_urlpaths='???', output_urlpath='.', flatten_index=True, fraction_not_null=0.5, storage_options={}, output_storage_options={}, local_config='')

Merges shape features dataframes

Parameters:

Name Type Description Default
shape_features_urlpaths List[str]

URL/paths to shape features parquet files

'???'
output_urlpath str

URL/path to output parquet file

'.'
flatten_index bool

flatten the multi-index columns of the wide-format dataframe into underscore-delimited names

True
fraction_not_null float

minimum fraction of non-null values a column must have to be kept in the wide format

0.5
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
local_config str

local config yaml file

''

Returns:

Name Type Description
dict

output paths and the number of features generated

Source code in src/luna/pathology/cli/merge_shape_features.py
@timed
@save_metadata
def cli(
    shape_features_urlpaths: Union[str, List[str]] = "???",
    output_urlpath: str = ".",
    flatten_index: bool = True,
    fraction_not_null: float = 0.5,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Merges shape features dataframes

    Args:
        shape_features_urlpaths (List[str]): URL/paths to shape features parquet files
        output_urlpath (str): URL/path to output parquet file
        flatten_index (bool): flatten the multi-index columns of the wide-format dataframe into underscore-delimited names
        fraction_not_null (float): minimum fraction of non-null values a column must have to be kept in the wide format
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): local config yaml file

    Returns:
        dict: output paths and the number of features generated
    """
    config = get_config(vars())

    dfs = []  # type: list[pd.DataFrame]
    if isinstance(config["shape_features_urlpaths"], list):
        for urlpath in config["shape_features_urlpaths"]:
            fs, path = fsspec.core.url_to_fs(urlpath, **config["storage_options"])
            with fs.open(path, "rb") as of:
                df = pd.read_parquet(of)
            dfs.append(df)
    else:
        fs, path_prefix = fsspec.core.url_to_fs(
            config["shape_features_urlpaths"], **config["storage_options"]
        )
        for path in fs.glob(f"{path_prefix}/**/shape_features.parquet"):
            with fs.open(path, "rb") as of:
                df = pd.read_parquet(of)
            dfs.append(df)

    df = pd.concat(dfs)
    fs, path_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )
    path = Path(path_prefix) / "long_shape_features.parquet"

    with fs.open(path, "wb", **config["output_storage_options"]) as of:
        df.to_parquet(of)

    df.variable = (
        df.variable.str.replace("µ", "u")
        .replace(r"(: |:)", " ", regex=True)
        .replace("[^a-zA-Z0-9 \n]", "", regex=True)
    )
    wide_path = Path(path_prefix) / "wide_shape_features.parquet"
    wide_df = df.pivot(
        index="slide_id", columns=["Parent", "Class", "variable"], values="value"
    )
    wide_df = wide_df.loc[
        :, wide_df.isna().sum() < len(wide_df) * config["fraction_not_null"]
    ]
    if config["flatten_index"]:
        wide_df.columns = ["_".join(col).strip() for col in wide_df.columns.values]
        wide_df.columns = wide_df.columns.str.replace(" ", "_")

    with fs.open(wide_path, "wb", **config["output_storage_options"]) as of:
        wide_df.to_parquet(of)

    return {
        "long_shape_features": fs.unstrip_protocol(str(path)),
        "wide_shape_features": fs.unstrip_protocol(str(wide_path)),
        "num_features": len(wide_df.columns),
    }
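A hedged sketch of invoking the merge from Python (the parquet paths are hypothetical):

from luna.pathology.cli.merge_shape_features import cli as merge_shape_features

result = merge_shape_features(
    shape_features_urlpaths=[
        "/results/slide_1/shape_features.parquet",  # hypothetical inputs
        "/results/slide_2/shape_features.parquet",
    ],
    output_urlpath="/results/merged",
    fraction_not_null=0.5,
)
print(result["num_features"], result["wide_shape_features"])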

run_stardist_cell_detection

__stardist_cell_lymphocyte(slide_urlpath, output_urlpath, slide_id, num_cores, use_gpu=False, image='mskmind/qupath-stardist:0.4.3', use_singularity=False, max_heap_size='64G', storage_options={}, output_storage_options={})

Run stardist using qupath CLI

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

required
output_urlpath str

output url/path

required
num_cores int

Number of cores to use for CPU parallelization

required
use_gpu bool

use GPU

False
image str

docker/singularity image

'mskmind/qupath-stardist:0.4.3'
use_singularity bool

use singularity instead of docker

False
max_heap_size str

maximum heap size to pass to java options

'64G'
storage_options dict

storage options to pass to reading functions

{}

Returns:

Name Type Description
dict dict

run metadata

Source code in src/luna/pathology/cli/run_stardist_cell_detection.py
@local_cache_urlpath(
    file_key_write_mode={"slide_urlpath": "r"},
    dir_key_write_mode={"output_urlpath": "w"},
)
def __stardist_cell_lymphocyte(
    slide_urlpath: str,
    output_urlpath: str,
    slide_id: str,
    num_cores: int,
    use_gpu: bool = False,
    image: str = "mskmind/qupath-stardist:0.4.3",
    use_singularity: bool = False,
    max_heap_size: str = "64G",
    storage_options: dict = {},
    output_storage_options: dict = {},
) -> dict:
    """Run stardist using qupath CLI

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        output_urlpath (str): output url/path
        num_cores (int): Number of cores to use for CPU parallelization
        use_gpu (bool): use GPU
        image (str): docker/singularity image
        use_singularity (bool): use singularity instead of docker
        max_heap_size (str): maximum heap size to pass to java options
        storage_options (dict): storage options to pass to reading functions

    Returns:
        dict: run metadata
    """
    fs, slide_path = fsspec.core.url_to_fs(slide_urlpath, **storage_options)
    ofs, output_path = fsspec.core.url_to_fs(output_urlpath, **output_storage_options)

    output_header_file = Path(output_path) / f"{slide_id}_cell_objects.parquet"
    if ofs.exists(output_header_file):
        logger.info(f"outputs already exist: {output_header_file}")
        return

    if ofs.protocol == "file" and not ofs.exists(output_path):
        ofs.mkdir(output_path)

    qupath_cmd = "QuPath-cpu"
    if use_gpu:
        qupath_cmd = "QuPath-gpu"

    runner_type = "DOCKER"
    if use_singularity:
        runner_type = "SINGULARITY"

    slide_filename = Path(slide_path).name
    command = f"{qupath_cmd} script --image /inputs/{slide_filename} /scripts/stardist_nuclei_and_lymphocytes.groovy"
    logger.info(f"Launching {runner_type} container:")
    logger.info(
        f"\tvolumes={slide_path}:'/inputs/{slide_filename}', {output_path}:'/output_dir'"
    )
    logger.info(f"\tnano_cpus={int(num_cores * 1e9)}")
    logger.info(f"\timage='{image}'")
    logger.info(f"\tcommand={command}")

    volumes_map = {
        slide_path: f"/inputs/{slide_filename}",
        output_path: "/output_dir",
    }

    runner_config = {
        "image": image,
        "command": command,
        "num_cores": num_cores,
        "max_heap_size": max_heap_size,
        "volumes_map": volumes_map,
        "use_gpu": use_gpu,
    }
    runner = runner_provider.get(runner_type, **runner_config)
    executor = runner.run()
    try:
        for line in executor:
            logger.info(line)
    except TypeError:
        logger.warning(f"{executor} is not iterable")

    stardist_output = Path(output_path) / "cell_detections.tsv"

    df = pd.read_csv(stardist_output, sep="\t")
    df.index = "cell-" + df.index.astype(int).astype(str)
    df.index.rename("cell_id", inplace=True)

    df = df.rename(
        columns={"Centroid X µm": "x_coord", "Centroid Y µm": "y_coord"}
    )  # x,ys follow this convention

    with fs.open(output_header_file, "wb") as of:
        df.to_parquet(of)

    logger.info("generated cell data:")
    logger.info(df)

    output_geojson_file = Path(output_path) / "cell_detections.geojson"

    properties = {
        "geojson_url": ofs.unstrip_protocol(str(output_geojson_file)),
        "tsv_url": ofs.unstrip_protocol(str(stardist_output)),
        "parquet_url": ofs.unstrip_protocol(str(output_header_file)),
        "spatial": True,
        "total_cells": len(df),
    }

    return properties

__stardist_simple(slide_urlpath, cell_expansion_size, image_type, output_urlpath, debug_opts, num_cores, image, use_singularity, max_heap_size, storage_options, output_storage_options)

Run stardist using qupath CLI on a single slide.

Parameters:

Name Type Description Default
slide_urlpath str

path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

required
cell_expansion_size float

size in pixels to expand cell cytoplasm

required
num_cores int

Number of cores to use for CPU parallelization

required
image_type str

qupath image type (BRIGHTFIELD_H_DAB)

required
output_urlpath str

output url/path

required
debug_opts str

debug options passed as arguments to groovy script

required
image str

docker/singularity image

required
use_singularity bool

use singularity instead of docker

required
max_heap_size str

maximum heap size to pass to java options

required
storage_options dict

storage options to pass to reading functions

required
output_storage_options dict

storage options to pass to writing functions

required

Returns:

Name Type Description
dict dict

run metadata

Source code in src/luna/pathology/cli/run_stardist_cell_detection.py
@local_cache_urlpath(
    file_key_write_mode={"slide_urlpath": "r"},
    dir_key_write_mode={"output_urlpath": "w"},
)
def __stardist_simple(
    slide_urlpath: str,
    cell_expansion_size: float,
    image_type: str,
    output_urlpath: str,
    debug_opts: str,
    num_cores: int,
    image: str,
    use_singularity: bool,
    max_heap_size: str,
    storage_options: dict,
    output_storage_options: dict,
) -> dict:
    """Run stardist using qupath CLI on slides in a slide manifest from
    slide_etl. URIs to resulting GeoJSON will be stored in a specified column
    of the returned slide manifest.

    Args:
        slide_urlpath (str): path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        cell_expansion_size (float): size in pixels to expand cell cytoplasm
        num_cores (int): Number of cores to use for CPU parallelization
        image_type (str): qupath image type (BRIGHTFIELD_H_DAB)
        output_urlpath (str): output url/path
        debug_opts (str): debug options passed as arguments to groovy script
        image (str): docker/singularity image
        use_singularity (bool): use singularity instead of docker
        max_heap_size (str): maximum heap size to pass to java options
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        dict: run metadata
    """
    fs, slide_path = fsspec.core.url_to_fs(slide_urlpath, **storage_options)
    ofs, output_path = fsspec.core.url_to_fs(output_urlpath, **output_storage_options)

    slide_id = Path(slide_urlpath).stem
    output_header_file = Path(output_path) / f"{slide_id}_cell_objects.parquet"
    if ofs.exists(output_header_file):
        logger.info(f"outputs already exist: {output_header_file}")
        return

    if ofs.protocol == "file" and not ofs.exists(output_path):
        ofs.mkdir(output_path)

    runner_type = "DOCKER"
    if use_singularity:
        runner_type = "SINGULARITY"

    slide_filename = Path(slide_path).name
    command = f"echo QuPath script --image /inputs/{slide_filename} --args [cellSize={cell_expansion_size},imageType={image_type},{debug_opts}] /scripts/stardist_simple.groovy"
    logger.info(f"Launching QuPath via {runner_type}:{image} ...")
    logger.info(
        f"\tvolumes={slide_urlpath}:'/inputs/{slide_filename}', {slide_path}:'/output_dir'"
    )
    logger.info(f"\tnano_cpus={int(num_cores * 1e9)}")
    logger.info(f"\timage='{image}'")
    logger.info(f"\tcommand={command}")

    volumes_map = {
        slide_path: f"/inputs/{slide_filename}",
        output_path: "/output_dir",
    }

    runner_config = {
        "image": image,
        "command": command,
        "num_cores": num_cores,
        "max_heap_size": max_heap_size,
        "volumes_map": volumes_map,
    }
    runner = runner_provider.get(runner_type, **runner_config)
    executor = runner.run()
    try:
        for line in executor:
            logger.info(line)
    except TypeError:
        logger.warning(f"{executor} is not iterable")

    stardist_output = Path(output_path) / "cell_detections.tsv"

    df = pd.read_csv(stardist_output, sep="\t")
    df.index = "cell-" + df.index.astype(int).astype(str)
    df.index.rename("cell_id", inplace=True)

    df = df.rename(
        columns={"Centroid X µm": "x_coord", "Centroid Y µm": "y_coord"}
    )  # x,ys follow this convention

    with ofs.open(output_header_file, "wb") as of:
        df.to_parquet(of)

    logger.info("generated cell data:")
    logger.info(df)

    output_geojson_file = Path(output_path) / "cell_detections.geojson"

    properties = {
        "geojson_url": ofs.unstrip_protocol(str(output_geojson_file)),
        "tsv_url": ofs.unstrip_protocol(str(stardist_output)),
        "parquet_url": ofs.unstrip_protocol(str(output_header_file)),
        "spatial": True,
        "total_cells": len(df),
    }

    return properties

stardist_cell_lymphocyte(slide_manifest, output_urlpath, num_cores, use_gpu=False, image='mskmind/qupath-stardist:0.4.3', use_singularity=False, max_heap_size='64G', storage_options={}, output_storage_options={}, annotation_column='lymphocyte_geojson_url')

Run stardist using qupath CLI

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest from slide_etl

required
output_urlpath str

output url/path

required
num_cores int

Number of cores to use for CPU parallelization

required
use_gpu bool

use GPU

False
image str

docker/singularity image

'mskmind/qupath-stardist:0.4.3'
use_singularity bool

use singularity instead of docker

False
max_heap_size str

maximum heap size to pass to java options

'64G'
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
annotation_column str

name of column in resulting slide manifest to store GeoJSON URIs

'lymphocyte_geojson_url'

Returns:

Type Description
DataFrame[SlideSchema]

DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/run_stardist_cell_detection.py
def stardist_cell_lymphocyte(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    num_cores: int,
    use_gpu: bool = False,
    image: str = "mskmind/qupath-stardist:0.4.3",
    use_singularity: bool = False,
    max_heap_size: str = "64G",
    storage_options: dict = {},
    output_storage_options: dict = {},
    annotation_column: str = "lymphocyte_geojson_url",
) -> DataFrame[SlideSchema]:
    """Run stardist using qupath CLI

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (str): output url/path
        num_cores (int): Number of cores to use for CPU parallelization
        use_gpu (bool): use GPU
        image (str): docker/singularity image
        use_singularity (bool): use singularity instead of docker
        max_heap_size (str): maximum heap size to pass to java options
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        annotation_column (str): name of column in resulting slide manifest to store GeoJSON URIs

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    client = get_or_create_dask_client()

    futures = []
    for row in slide_manifest.itertuples(name="Slide"):
        fs, output_path = fsspec.core.url_to_fs(
            output_urlpath, **output_storage_options
        )
        future = client.submit(
            __stardist_cell_lymphocyte,
            row.url,
            fs.unstrip_protocol(str(Path(output_path) / row.id)),
            row.id,
            num_cores,
            use_gpu,
            image,
            use_singularity,
            max_heap_size,
            storage_options,
            output_storage_options,
        )
        futures.append(future)
    results = client.gather(futures)
    return slide_manifest.assign(
        **{annotation_column: [x["geojson_url"] for x in results]}
    )
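A minimal sketch, assuming a manifest from slide_etl and hypothetical paths; the returned manifest gains the lymphocyte_geojson_url column:

from luna.pathology.cli.run_stardist_cell_detection import stardist_cell_lymphocyte
from luna.pathology.cli.slide_etl import slide_etl

slide_manifest = slide_etl(["/data/slides/123.svs"], project_name="PRO-12-123", no_copy=True)
slide_manifest = stardist_cell_lymphocyte(
    slide_manifest,
    output_urlpath="/results/stardist",
    num_cores=8,
    use_gpu=True,   # selects the GPU build of the QuPath container
)
print(slide_manifest["lymphocyte_geojson_url"])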

stardist_cell_lymphocyte_cli(slide_urlpath='???', output_urlpath='.', num_cores=1, use_gpu=False, image='mskmind/qupath-stardist:0.4.3', use_singularity=False, max_heap_size='64G', storage_options={}, output_storage_options={})

Run stardist using qupath CLI

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

'???'
output_urlpath str

output url/path

'.'
num_cores int

Number of cores to use for CPU parallelization

1
use_gpu bool

use GPU

False
image str

docker/singularity image

'mskmind/qupath-stardist:0.4.3'
use_singularity bool

use singularity instead of docker

False
max_heap_size str

maximum heap size to pass to java options

'64G'
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}

Returns:

Name Type Description
dict dict

run metadata

Source code in src/luna/pathology/cli/run_stardist_cell_detection.py
@timed
@save_metadata
def stardist_cell_lymphocyte_cli(
    slide_urlpath: str = "???",
    output_urlpath: str = ".",
    num_cores: int = 1,
    use_gpu: bool = False,
    image: str = "mskmind/qupath-stardist:0.4.3",
    use_singularity: bool = False,
    max_heap_size: str = "64G",
    storage_options: dict = {},
    output_storage_options: dict = {},
) -> dict:
    """Run stardist using qupath CLI

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        output_urlpath (str): output url/path
        num_cores (int): Number of cores to use for CPU parallelization
        use_gpu (bool): use GPU
        image (str): docker/singularity image
        use_singularity (bool): use singularity instead of docker
        max_heap_size (str): maximum heap size to pass to java options
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        dict: run metadata
    """
    config = get_config(vars())
    slide_id = Path(config["slide_urlpath"]).stem
    properties = __stardist_cell_lymphocyte(
        config["slide_urlpath"],
        config["output_urlpath"],
        slide_id,
        config["num_cores"],
        config["use_gpu"],
        config["image"],
        config["use_singularity"],
        config["max_heap_size"],
        config["storage_options"],
        config["output_storage_options"],
    )
    return properties

stardist_simple(slide_manifest, cell_expansion_size, image_type, output_urlpath, debug_opts, num_cores, image, use_singularity, max_heap_size, storage_options, output_storage_options, annotation_column='stardist_geojson_url')

Run stardist using qupath CLI on slides in a slide manifest from slide_etl. URIs to resulting GeoJSON will be stored in a specified column of the returned slide manifest.

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest from slide_etl

required
cell_expansion_size float

size in pixels to expand cell cytoplasm

required
image_type str

qupath image type (BRIGHTFIELD_H_DAB)

required
output_urlpath str

output url/path

required
debug_opts str

debug options passed as arguments to groovy script

required
num_cores int

Number of cores to use for CPU parallelization

required
image str

docker/singularity image

required
use_singularity bool

use singularity instead of docker

required
max_heap_size str

maximum heap size to pass to java options

required
storage_options dict

storage options to pass to reading functions

required
output_storage_options dict

storage options to pass to writing functions

required
annotation_column str

name of column in resulting slide manifest to store GeoJSON URIs

'stardist_geojson_url'

Returns:

Type Description
DataFrame[SlideSchema]

DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/run_stardist_cell_detection.py
def stardist_simple(
    slide_manifest: DataFrame[SlideSchema],
    cell_expansion_size: float,
    image_type: str,
    output_urlpath: str,
    debug_opts: str,
    num_cores: int,
    image: str,
    use_singularity: bool,
    max_heap_size: str,
    storage_options: dict,
    output_storage_options: dict,
    annotation_column: str = "stardist_geojson_url",
) -> DataFrame[SlideSchema]:
    """Run stardist using qupath CLI on slides in a slide manifest from
    slide_etl. URIs to resulting GeoJSON will be stored in a specified column
    of the returned slide manifest.

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        cell_expansion_size (float): size in pixels to expand cell cytoplasm
        image_type (str): qupath image type (BRIGHTFIELD_H_DAB)
        output_urlpath (str): output url/path
        debug_opts (str): debug options passed as arguments to groovy script
        num_cores (int): Number of cores to use for CPU parallelization
        image (str): docker/singularity image
        use_singularity (bool): use singularity instead of docker
        max_heap_size (str): maximum heap size to pass to java options
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        annotation_column (str): name of column in resulting slide manifest to store GeoJSON URIs

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """

    client = get_or_create_dask_client()

    futures = []
    for row in slide_manifest.itertuples(name="Slide"):
        future = client.submit(
            __stardist_simple,
            row.url,
            cell_expansion_size,
            image_type,
            output_urlpath,
            debug_opts,
            num_cores,
            image,
            use_singularity,
            max_heap_size,
            storage_options,
            output_storage_options,
        )
        futures.append(future)
    results = client.gather(futures)
    return slide_manifest.assign(
        **{annotation_column: [x["geojson_url"] for x in results]}
    )
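All arguments except annotation_column are required; a hedged call might look like this (paths and the expansion size are hypothetical):

from luna.pathology.cli.run_stardist_cell_detection import stardist_simple
from luna.pathology.cli.slide_etl import slide_etl

slide_manifest = slide_etl(["/data/slides/123.svs"], project_name="PRO-12-123", no_copy=True)
slide_manifest = stardist_simple(
    slide_manifest,
    cell_expansion_size=8.0,              # hypothetical cytoplasm expansion in pixels
    image_type="BRIGHTFIELD_H_DAB",
    output_urlpath="/results/stardist",
    debug_opts="",
    num_cores=8,
    image="mskmind/qupath-stardist:0.4.3",
    use_singularity=False,
    max_heap_size="64G",
    storage_options={},
    output_storage_options={},
)
print(slide_manifest["stardist_geojson_url"])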

stardist_simple_cli(slide_urlpath='???', cell_expansion_size='???', image_type='???', output_urlpath='.', debug_opts='', num_cores=1, image='mskmind/qupath-stardist:0.4.3', use_singularity=False, max_heap_size='64G', storage_options={}, output_storage_options={}, local_config='')

Run stardist using qupath CLI

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

'???'
cell_expansion_size float

size in pixels to expand cell cytoplasm

'???'
num_cores int

Number of cores to use for CPU parallelization

1
image_type str

qupath image type (BRIGHTFIELD_H_DAB)

'???'
output_urlpath str

output url/path

'.'
debug_opts str

debug options passed as arguments to groovy script

''
image str

docker/singularity image

'mskmind/qupath-stardist:0.4.3'
use_singularity bool

use singularity instead of docker

False
max_heap_size str

maximum heap size to pass to java options

'64G'
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
local_config str

local config yaml file

''

Returns:

Name Type Description
dict

metadata about function call

Source code in src/luna/pathology/cli/run_stardist_cell_detection.py
@timed
@save_metadata
def stardist_simple_cli(
    slide_urlpath: str = "???",
    cell_expansion_size: float = "???",  # type: ignore
    image_type: str = "???",
    output_urlpath: str = ".",
    debug_opts: str = "",
    num_cores: int = 1,
    image: str = "mskmind/qupath-stardist:0.4.3",
    use_singularity: bool = False,
    max_heap_size: str = "64G",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Run stardist using qupath CLI

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        cell_expansion_size (float): size in pixels to expand cell cytoplasm
        num_cores (int): Number of cores to use for CPU parallelization
        image_type (str): qupath image type (BRIGHTFIELD_H_DAB)
        output_urlpath (str): output url/path
        debug_opts (str): debug options passed as arguments to groovy script
        image (str): docker/singularity image
        use_singularity (bool): use singularity instead of docker
        max_heap_size (str): maximum heap size to pass to java options
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): local config yaml file

    Returns:
        dict: metadata about function call
    """

    config = get_config(vars())

    return __stardist_simple(
        config["slide_urlpath"],
        config["cell_expansion_size"],
        config["image_type"],
        config["output_urlpath"],
        config["debug_opts"],
        config["num_cores"],
        config["image"],
        config["use_singularity"],
        config["max_heap_size"],
        config["storage_options"],
        config["output_storage_options"],
    )

run_tissue_detection

cli(slide_urlpath='???', tiles_urlpath='', filter_query='???', tile_size=None, thumbnail_magnification=None, tile_magnification=None, batch_size=2000, output_urlpath='.', force=False, dask_options={}, storage_options={}, output_storage_options={}, local_config='')

Run simple/deterministic tissue detection algorithms based on a filter query, to reduce tiles to those (likely) to contain actual tissue

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with pyvips, .svs, .tif, .scn, ...)

'???'
tiles_urlpath str

url/path to tiles manifest (parquet)

''
filter_query str

pandas query by which to filter tiles based on their various tissue detection scores

'???'
tile_size Optional[int]

size of tiles to use (at the requested magnification)

None
thumbnail_magnification Optional[int]

Magnification scale at which to create thumbnail for tissue detection

None
tile_magnification Optional[int]

Magnification scale at which to generate tiles

None
batch_size int

batch size for processing

2000
output_urlpath str

Output url/path

'.'
force bool

overwrite outputs if they exist

False
dask_options dict

dask options

{}
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
local_config str

local config file

''

Returns:

Name Type Description
dict dict

metadata about cli function call

Source code in src/luna/pathology/cli/run_tissue_detection.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    tiles_urlpath: str = "",
    filter_query: str = "???",
    tile_size: Optional[int] = None,
    thumbnail_magnification: Optional[int] = None,
    tile_magnification: Optional[int] = None,
    batch_size: int = 2000,
    output_urlpath: str = ".",
    force: bool = False,
    dask_options: dict = {},
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
) -> dict:
    """Run simple/deterministic tissue detection algorithms based on a filter query, to reduce tiles to those (likely) to contain actual tissue
    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with pyvips, .svs, .tif, .scn, ...)
        tiles_urlpath (str): url/path to tiles manifest (parquet)
        filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores
        tile_size (int): size of tiles to use (at the requested magnification)
        thumbnail_magnification (Optional[int]): Magnification scale at which to create thumbnail for tissue detection
        tile_magnification (Optional[int]): Magnification scale at which to generate tiles
        batch_size (int): batch size for processing
        output_urlpath (str): Output url/path
        force (bool): overwrite outputs if they exist
        dask_options (dict): dask options
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): local config file
    Returns:
        dict: metadata about cli function call

    """
    config = get_config(vars())

    configure_dask_client(**config["dask_options"])

    if not config["tile_size"] and not config["tiles_urlpath"]:
        raise fire.core.FireError("Specify either tiles_urlpath or tile_size")

    slide_id = Path(config["slide_urlpath"]).stem

    tiles_urlpath = config["tiles_urlpath"]

    with make_temp_directory() as temp_dir:
        if not tiles_urlpath:
            result = __generate_tiles(
                config["slide_urlpath"],
                config["tile_size"],
                temp_dir,
                config["force"],
                config["tile_magnification"],
                config["storage_options"],
            )
            tiles_urlpath = result["tiles_url"]

        properties = __detect_tissue(
            config["slide_urlpath"],
            tiles_urlpath,
            slide_id,
            config["thumbnail_magnification"],
            config["filter_query"],
            config["batch_size"],
            config["output_urlpath"],
            config["force"],
            config["storage_options"],
            config["output_storage_options"],
        )

    return properties
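A hedged sketch of calling this entrypoint from Python (the slide path and filter column are hypothetical):

from luna.pathology.cli.run_tissue_detection import cli as detect_tissue_cli

metadata = detect_tissue_cli(
    slide_urlpath="/data/slides/123.svs",
    tile_size=256,
    filter_query="otsu_score > 0.5",   # hypothetical score column
    output_urlpath="/results/123",
)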

compute_otsu_score(tile, slide_path, otsu_threshold)

Return otsu score for the tile.

Parameters:

Name Type Description Default
tile Tile

tile with metadata

required
slide_path str

path to slide

required
otsu_threshold float

otsu threshold value

required

Source code in src/luna/pathology/cli/run_tissue_detection.py
def compute_otsu_score(tile: Tile, slide_path: str, otsu_threshold: float) -> float:
    """
    Return otsu score for the tile.
    Args:
        tile (Tile): tile with metadata
        slide_path (str): path to slide
        otsu_threshold (float): otsu threshold value
    """
    with TiffSlide(slide_path) as slide:
        tile_arr = get_array_from_tile(tile, slide, 10)
    score = np.mean((rgb2gray(tile_arr) < otsu_threshold).astype(int))
    return score

compute_purple_score(tile, slide_path)

Return purple score for the tile.

Parameters:

Name Type Description Default
tile Tile

tile with metadata

required
slide_path str

path to slide

required

Source code in src/luna/pathology/cli/run_tissue_detection.py
def compute_purple_score(
    tile: Tile,
    slide_path: str,
) -> float:
    """
    Return purple score for the tile.
    Args:
        tile (Tile): tile with metadata
        slide_path (str): path to slide
    """
    with TiffSlide(slide_path) as slide:
        tile_arr = get_array_from_tile(tile, slide, 10)
    return get_purple_score(tile_arr)

compute_stain_score(tile, slide_path, vectors, channel, stain_threshold)

Returns stain score for the tile

Parameters:

Name Type Description Default
tile Tile

tile with metadata

required
slide_path str

path to slide

required
vectors np.ndarray

stain vectors

required
channel int

stain channel

required
stain_threshold float

stain threshold value

required

Source code in src/luna/pathology/cli/run_tissue_detection.py
def compute_stain_score(
    tile: Tile,
    slide_path: str,
    vectors,
    channel,
    stain_threshold: float,
) -> np.floating:
    """
    Returns stain score for the tile
    Args:
        tile (Tile): tile with metadata
        slide_path (str): path to slide
        vectors (np.ndarray): stain vectors
        channel (int): stain channel
        stain_threshold (float): stain threshold value
    """
    with TiffSlide(slide_path) as slide:
        tile_arr = get_array_from_tile(tile, slide, 10)
    stain = pull_stain_channel(tile_arr, vectors=vectors, channel=channel)
    score = np.mean(stain > stain_threshold)
    return score
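The three scoring helpers above share one pattern: read the tile region, reduce it to a boolean mask, and take the mean. A self-contained sketch of the otsu variant, where the threshold would normally come from Otsu's method on a slide thumbnail (the arrays below are stand-ins):

import numpy as np
from skimage.color import rgb2gray
from skimage.filters import threshold_otsu

def otsu_score(tile_arr: np.ndarray, otsu_threshold: float) -> float:
    # fraction of pixels darker than the threshold, as in compute_otsu_score
    return float(np.mean(rgb2gray(tile_arr) < otsu_threshold))

thumbnail = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)  # stand-in thumbnail
threshold = threshold_otsu(rgb2gray(thumbnail))
dark_tile = np.full((256, 256, 3), 40, dtype=np.uint8)          # mostly-tissue dummy tile
print(otsu_score(dark_tile, threshold))                         # -> 1.0, every pixel is dark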

detect_tissue(slide_manifest, tile_size=None, thumbnail_magnification=None, tile_magnification=None, filter_query='', batch_size=2000, force=True, storage_options={}, output_urlpath='.', output_storage_options={})

Run simple/deterministic tissue detection algorithms based on a filter query, to reduce tiles to those (likely) to contain actual tissue

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest from slide_etl

required
tile_size Optional[int]

size of tiles to use (at the requested magnification)

None
thumbnail_magnification Optional[int]

Magnification scale at which to create thumbnail for tissue detection

None
tile_magnification Optional[int]

Magnification scale at which to generate tiles

None
filter_query str

pandas query by which to filter tiles based on their various tissue detection scores

''
batch_size int

batch size for processing

2000
force bool

overwrite outputs if they exist

True
storage_options dict

storage options to pass to reading functions

{}
output_urlpath str

Output url/path

'.'
output_storage_options dict

storage options to pass to writing functions

{}

Returns:

Type Description
DataFrame[SlideSchema]

slide manifest

Source code in src/luna/pathology/cli/run_tissue_detection.py
def detect_tissue(
    slide_manifest: DataFrame[SlideSchema],
    tile_size: Optional[int] = None,
    thumbnail_magnification: Optional[int] = None,
    tile_magnification: Optional[int] = None,
    filter_query: str = "",
    batch_size: int = 2000,
    force: bool = True,
    storage_options: dict = {},
    output_urlpath: str = ".",
    output_storage_options: dict = {},
) -> DataFrame[SlideSchema]:
    """Run simple/deterministic tissue detection algorithms based on a filter query, to reduce tiles to those (likely) to contain actual tissue
    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        tile_size (int): size of tiles to use (at the requested magnification)
        thumbnail_magnification (Optional[int]): Magnification scale at which to create thumbnail for tissue detection
        tile_magnification (Optional[int]): Magnification scale at which to generate tiles
        filter_query (str): pandas query by which to filter tiles based on their various tissue detection scores
        batch_size (int): batch size for processing
        force (bool): overwrite outputs if they exist
        storage_options (dict): storage options to pass to reading functions
        output_urlpath (str): Output url/path
        output_storage_options (dict): storage options to pass to writing functions
    Returns:
        DataFrame[SlideSchema]: slide manifest

    """
    client = get_or_create_dask_client()

    with make_temp_directory() as temp_dir:
        if "tiles_url" not in slide_manifest.columns:
            slide_manifest = generate_tiles(
                slide_manifest,
                tile_size,
                temp_dir,
                tile_magnification,
                storage_options,
            )

        futures = []
        for slide in slide_manifest.itertuples(name="Slide"):
            future = client.submit(
                __detect_tissue,
                slide.url,
                slide.tiles_url,
                slide.id,
                thumbnail_magnification,
                filter_query,
                batch_size,
                output_urlpath,
                force,
                storage_options,
                output_storage_options,
            )
            futures.append(future)
        progress(futures)

        results = client.gather(futures)

        slide_manifest = slide_manifest.assign(
            tiles_url=[x["tiles_url"] for x in results]
        )

    return slide_manifest
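A minimal sketch chaining manifest construction, tissue detection, and tile saving (paths and the filter column are hypothetical); this is the same sequence infer_tile_labels runs internally when no tiles_url column exists yet:

from luna.pathology.cli.run_tissue_detection import detect_tissue
from luna.pathology.cli.save_tiles import save_tiles
from luna.pathology.cli.slide_etl import slide_etl

slide_manifest = slide_etl(["/data/slides/123.svs"], project_name="PRO-12-123", no_copy=True)
slide_manifest = detect_tissue(
    slide_manifest,
    tile_size=256,
    filter_query="purple_score > 0.1",   # hypothetical score column
    output_urlpath="/results/tiles",
)
slide_manifest = save_tiles(slide_manifest, "/results/tiles")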

save_tiles

__save_tiles(tiles_urlpath, slide_urlpath, output_h5_path, batch_size=2000, storage_options={}, output_storage_options={})

Saves tiles to disk

Tile addresses and arrays are saved as key-value pairs in (tiles.h5), and the corresponding manifest/header file (tiles.parquet) is also generated

Parameters:

Name Type Description Default
tiles_urlpath str

url/path to tile manifest

required
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

required
output_h5_path str

output h5 url/path

required
batch_size int

size in batch dimension to chunk jobs

2000
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}

Returns:

Type Description
DataFrame

pd.DataFrame: tile manifest

Source code in src/luna/pathology/cli/save_tiles.py
@local_cache_urlpath(
    file_key_write_mode={
        "slide_urlpath": "r",
        "output_h5_path": "w",
    },
)
def __save_tiles(
    tiles_urlpath: str,
    slide_urlpath: str,
    output_h5_path: str,
    batch_size: int = 2000,
    storage_options: dict = {},
    output_storage_options: dict = {},
):
    """Saves tiles to disk

    Tile addresses and arrays are saved as key-value pairs in (tiles.h5),
    and the corresponding manifest/header file (tiles.parquet) is also generated

    Args:
        tiles_urlpath (str): url/path to tile manifest
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        output_h5_path (str): output h5 url/path
        batch_size (int): size in batch dimension to chunk jobs
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        pd.DataFrame: tile manifest
    """

    tiles_df = pd.read_parquet(tiles_urlpath, storage_options=storage_options)

    get_or_create_dask_client()

    def f_many(iterator):
        with TiffSlide(slide_urlpath) as slide:
            return [(x.address, get_array_from_tile(x, slide=slide)) for x in iterator]

    chunks = db.from_sequence(
        tiles_df.itertuples(name="Tile"), partition_size=batch_size
    )

    ProgressBar().register()
    results = chunks.map_partitions(f_many)
    with h5py.File(output_h5_path, "w") as hfile:
        for result in results.compute():
            address, tile_arr = result
            hfile.create_dataset(address, data=tile_arr)

    return tiles_df

cli(slide_urlpath='???', tiles_urlpath='???', batch_size=2000, output_urlpath='.', force=False, storage_options={}, output_storage_options={}, dask_options={}, local_config='')

Saves tiles to disk

Tile addresses and arrays are saved as key-value pairs in (tiles.h5), and the corresponding manifest/header file (tiles.parquet) is also generated

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

'???'
tiles_urlpath str

url/path to tile manifest (.parquet)

'???'
batch_size int

size in batch dimension to chunk jobs

2000
output_urlpath str

output url/path prefix

'.'
force bool

overwrite outputs if they exist

False
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
dask_options dict

dask options

{}
local_config str

url/path to local config yaml file

''

Returns:

Name Type Description
dict

metadata about function call

Source code in src/luna/pathology/cli/save_tiles.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    tiles_urlpath: str = "???",
    batch_size: int = 2000,
    output_urlpath: str = ".",
    force: bool = False,
    storage_options: dict = {},
    output_storage_options: dict = {},
    dask_options: dict = {},
    local_config: str = "",
):
    """Saves tiles to disk

    Tile addresses and arrays are saved as key-value pairs in (tiles.h5),
    and the corresponding manifest/header file (tiles.parquet) is also generated

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        tiles_urlpath (str): url/path to tile manifest (.parquet)
        batch_size (int): size in batch dimension to chunk jobs
        output_urlpath (str): output url/path prefix
        force (bool): overwrite outputs if they exist
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        dask_options (dict): dask options
        local_config (str): url/path to local config yaml file

    Returns:
        dict: metadata about function call
    """
    config = get_config(vars())

    configure_dask_client(**config["dask_options"])

    properties = _save_tiles(
        config["tiles_urlpath"],
        config["slide_urlpath"],
        config["output_urlpath"],
        config["force"],
        config["batch_size"],
        config["storage_options"],
        config["output_storage_options"],
    )

    return properties

save_tiles(slide_manifest, output_urlpath, force=True, batch_size=2000, storage_options={}, output_storage_options={})

Saves tiles to disk

Tile addresses and arrays are saved as key-value pairs in (tiles.h5), and the corresponding manifest/header file (tiles.parquet) is also generated

Parameters:

Name Type Description Default
slide_manifest DataFrame[SlideSchema]

slide manifest from slide_etl

required
output_urlpath str

output url/path prefix

required
force bool

overwrite outputs if they exist

True
batch_size int

size in batch dimension to chunk jobs

2000
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}

Returns:

Type Description
DataFrame[SlideSchema]

DataFrame[SlideSchema]: slide manifest

Source code in src/luna/pathology/cli/save_tiles.py
def save_tiles(
    slide_manifest: DataFrame[SlideSchema],
    output_urlpath: str,
    force: bool = True,
    batch_size: int = 2000,
    storage_options: dict = {},
    output_storage_options: dict = {},
) -> DataFrame[SlideSchema]:
    """Saves tiles to disk

    Tile addresses and arrays are saved as key-value pairs in (tiles.h5),
    and the corresponding manifest/header file (tiles.parquet) is also generated

    Args:
        slide_manifest (DataFrame[SlideSchema]): slide manifest from slide_etl
        output_urlpath (str): output url/path prefix
        force (bool): overwrite outputs if they exist
        batch_size (int): size in batch dimension to chunk jobs
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions

    Returns:
        DataFrame[SlideSchema]: slide manifest
    """
    client = get_or_create_dask_client()

    if "tiles_url" not in slide_manifest.columns:
        raise ValueError("Generate tiles first")

    output_filesystem, output_path_prefix = fsspec.core.url_to_fs(
        output_urlpath, **output_storage_options
    )

    if not output_filesystem.exists(output_urlpath):
        output_filesystem.mkdir(output_urlpath)

    futures = []
    for slide in slide_manifest.itertuples(name="Slide"):
        future = client.submit(
            _save_tiles,
            slide.tiles_url,
            slide.url,
            output_urlpath,
            force,
            batch_size,
            storage_options,
            output_storage_options,
        )
        futures.append(future)

    results = client.gather(futures)
    return slide_manifest.assign(tiles_url=[x["tiles_url"] for x in results])
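After save_tiles runs, each slide's pixel data sits in an HDF5 file keyed by tile address, next to its parquet manifest. A hedged sketch of reading a tile back (paths are hypothetical, and the manifest is assumed to be indexed by tile address):

import h5py
import pandas as pd

tiles_df = pd.read_parquet("/results/tiles/123.tiles.parquet")
with h5py.File("/results/tiles/123.tiles.h5", "r") as hfile:
    address = tiles_df.index[0]
    tile_arr = hfile[address][()]   # numpy array of tile pixels
print(address, tile_arr.shape)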

slide_etl

SlideBuilder

Source code in src/luna/pathology/cli/slide_etl.py
class SlideBuilder:
    def __init__(self, storage_options: dict = {}, output_storage_options: dict = {}):
        self.storage_options = storage_options
        self.output_storage_options = output_storage_options

    def __generate_properties(self, slide, url):
        with open(url, **self.storage_options) as f:
            s = TiffSlide(f)
            slide.properties = s.properties
            try:
                to_mag_scale_factor = get_scale_factor_at_magnification(
                    s, requested_magnification=1
                )
                sample_arr = get_downscaled_thumbnail(s, to_mag_scale_factor)
                stain_vectors = get_stain_vectors_macenko(sample_arr)
                slide.channel0_R = stain_vectors[0, 0]
                slide.channel0_G = stain_vectors[0, 1]
                slide.channel0_B = stain_vectors[0, 2]
                slide.channel1_R = stain_vectors[1, 0]
                slide.channel1_G = stain_vectors[1, 1]
                slide.channel1_B = stain_vectors[1, 2]
            except Exception as err:
                logger.warning(f"Couldn't get stain vectors: {url} - {err}")

    def copy_slide(self, slide, output_urlpath, chunksize=50000000):
        new_slide = slide.copy()
        name = Path(slide.url).name
        fs, output_path = fsspec.core.url_to_fs(
            output_urlpath, **self.output_storage_options
        )
        p = Path(output_path) / name
        with open(slide.url, "rb", **self.storage_options) as f1:
            with fs.open(p, "wb") as f2:
                while True:
                    data = f1.read(chunksize)
                    if not data:
                        break
                    f2.write(data)
        new_slide.url = fs.unstrip_protocol(str(p))
        return new_slide

    def get_slide(self, url, project_name="", comment="") -> Slide:
        """Extract openslide properties and write slide to storage location

        Args:
            path (string): path to slide image

        Returns:
            slide (Slide): slide object
        """

        fs, path = fsspec.core.url_to_fs(url, **self.storage_options)

        id = Path(path).stem
        size = fs.du(path)
        slide = Slide(
            id=id,
            project_name=project_name,
            comment=comment,
            slide_size=size,
            url=url,
            uuid=str(uuid.uuid3(uuid.NAMESPACE_URL, url)),
        )

        self.__generate_properties(slide, url)

        return slide
get_slide(url, project_name='', comment='')

Extract openslide properties and write slide to storage location

Parameters:

Name Type Description Default
path string

path to slide image

required

Returns:

Name Type Description
slide Slide

slide object

Source code in src/luna/pathology/cli/slide_etl.py
def get_slide(self, url, project_name="", comment="") -> Slide:
    """Extract openslide properties and write slide to storage location

    Args:
        path (string): path to slide image

    Returns:
        slide (Slide): slide object
    """

    fs, path = fsspec.core.url_to_fs(url, **self.storage_options)

    id = Path(path).stem
    size = fs.du(path)
    slide = Slide(
        id=id,
        project_name=project_name,
        comment=comment,
        slide_size=size,
        url=url,
        uuid=str(uuid.uuid3(uuid.NAMESPACE_URL, url)),
    )

    self.__generate_properties(slide, url)

    return slide
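A minimal sketch of using the builder directly (the slide path is hypothetical; stain-vector extraction may log a warning if Macenko estimation fails):

from luna.pathology.cli.slide_etl import SlideBuilder

sb = SlideBuilder()
slide = sb.get_slide("/data/slides/123.svs", project_name="PRO-12-123")
print(slide.id, slide.uuid, slide.slide_size)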

cli(slide_urlpath='???', project_name='', comment='', subset_csv_urlpath='', debug_limit=0, output_urlpath='', storage_options={}, output_storage_options={}, local_config='', no_copy=False, metadata_extension='parquet')

Ingest slides by adding them to a file or s3 based storage location and generating metadata about them

Parameters:

Name Type Description Default
slide_urlpath str

path to slide image

'???'
project_name str

project name under which the slides should reside

''
comment str

comment and description of dataset

''
subset_csv_urlpath str

url/path to subset csv

''
debug_limit int

limit number of slides

0
output_urlpath str

url/path to output table

''
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
local_config str

url/path to YAML config file

''
no_copy bool

determines whether we copy slides to output_urlpath

False
metadata_extension str

file extension of generated metadata file (either 'csv' or 'parquet')

'parquet'
Source code in src/luna/pathology/cli/slide_etl.py
@timed
def cli(
    slide_urlpath: str = "???",
    project_name: str = "",
    comment: str = "",
    subset_csv_urlpath: str = "",
    debug_limit: int = 0,
    output_urlpath: str = "",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
    no_copy: bool = False,
    metadata_extension: str = "parquet",
):
    """Ingest slide by adding them to a file or s3 based storage location and generating metadata about them


    Args:
        slide_urlpath (str): path to slide image
        project_name (str): project name under which the slides should reside
        comment (str): comment and description of dataset
        subset_csv_urlpath (str): url/path to subset csv
        debug_limit (int): limit number of slides
        output_urlpath (str): url/path to output table
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): url/path to YAML config file
        no_copy (bool): determines whether we copy slides to output_urlpath
        metadata_extension (str): file extension of generated metadata file (either 'csv' or 'parquet')
    """

    config = get_config(vars())
    filesystem, slide_path = fsspec.core.url_to_fs(
        config["slide_urlpath"], **config["storage_options"]
    )
    slide_paths = []  # type: list[str]
    if any(slide_path.endswith(ext) for ext in VALID_SLIDE_EXTENSIONS):
        slide_paths.append(slide_path)
    else:
        for ext in VALID_SLIDE_EXTENSIONS:
            slide_paths += filesystem.glob(f"{slide_path}/*{ext}")

    if config["metadata_extension"]:
        extension = config["metadata_extension"].lower().replace(".", "")

    if config["subset_csv_urlpath"]:
        slide_paths = apply_csv_filter(
            slide_paths, config["subset_csv_urlpath"], config["storage_options"]
        )
    if config["debug_limit"] > 0:
        slide_paths = slide_paths[: config["debug_limit"]]

    configure_dask_client()

    if len(slide_paths) == 0:
        return None

    slide_urls = [filesystem.unstrip_protocol(slide_path) for slide_path in slide_paths]

    df = slide_etl(
        slide_urls,
        config["project_name"],
        config["comment"],
        config["storage_options"],
        config["output_urlpath"],
        config["output_storage_options"],
        config["no_copy"],
    )

    logger.info(df)
    if config["output_urlpath"]:
        output_filesystem, output_path = fsspec.core.url_to_fs(
            config["output_urlpath"], **config["output_storage_options"]
        )

        f = Path(output_path) / f"slide_ingest_{config['project_name']}.{extension}"
        with output_filesystem.open(f, "wb") as of:
            if extension == "csv":
                logger.info("Writing to csv file")
                df.to_csv(of)
            elif extension == "parquet":
                logger.info("Writing to parquet file")
                df.to_parquet(of)

slide_etl(slide_urls, project_name, comment='', storage_options={}, output_urlpath='', output_storage_options={}, no_copy=False)

Ingest slides by adding them to a file or s3 based storage location and generating metadata about them

Parameters:

Name Type Description Default
slide_urls Union[str, List[str]

path to slide image(s)

required
project_name str

project name under which the slides should reside

required
comment str

comment and description of dataset

''
storage_options dict

storage options to pass to reading functions

{}
output_urlpath str

url/path to output table

''
output_storage_options dict

storage options to pass to writing functions

{}
no_copy bool

do not copy slides to output path

False

Returns:

Type Description
DataFrame

DataFrame[SlideSchema]: dataframe containing the metadata of all the slides

Source code in src/luna/pathology/cli/slide_etl.py
def slide_etl(
    slide_urls: Union[str, List[str]],
    project_name: str,
    comment: str = "",
    storage_options: dict = {},
    output_urlpath: str = "",
    output_storage_options: dict = {},
    no_copy: bool = False,
) -> DataFrame:
    """Ingest slides by adding them to a file or s3 based storage location and generating metadata about them

    Args:
        slide_urls (Union[str, List[str]]): path to slide image(s)
        project_name (str): project name under which the slides should reside
        comment (str): comment and description of dataset
        storage_options (dict): storage options to pass to reading functions
        output_urlpath (str): url/path to output table
        output_storage_options (dict): storage options to pass to writing functions
        no_copy (bool): do not copy slides to output path


    Returns:
        DataFrame[SlideSchema]: dataframe containing the metadata of all the slides
    """
    sb = SlideBuilder(storage_options, output_storage_options=output_storage_options)
    if isinstance(slide_urls, str):
        return __slide_etl(
            sb, slide_urls, project_name, comment, output_urlpath, no_copy
        )

    client = get_or_create_dask_client()

    futures = [
        client.submit(
            __slide_etl,
            sb,
            slide_url,
            project_name,
            comment,
            output_urlpath,
            no_copy,
        )
        for slide_url in slide_urls
    ]
    progress(futures)
    dfs = client.gather(futures)
    return pd.concat(dfs)
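A hedged ingestion sketch (slide paths and output location are hypothetical); with no_copy left False, each slide is copied under output_urlpath and the returned dataframe records the new URLs:

from luna.pathology.cli.slide_etl import slide_etl

df = slide_etl(
    ["/data/slides/123.svs", "/data/slides/124.svs"],
    project_name="PRO-12-123",
    comment="ingest test batch",
    output_urlpath="/data/ingested",
)
print(df[["id", "url", "uuid"]])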

visualize_tile_labels_png

cli(slide_urlpath='???', tiles_urlpath='', mpp_units=False, plot_labels='???', output_urlpath='.', requested_magnification=None, tile_size=None, storage_options={}, output_storage_options={}, local_config='')

Generate nice tile markup images with continuous or discrete tile scores

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

'???'
tiles_urlpath str

url/path to a slide-tile manifest file (.tiles.csv)

''
mpp_units bool

if true, additional rescaling is applied to match micro-meter and pixel coordinate systems

False
plot_labels List[str]

labels to plot

'???'
output_urlpath str

output url/path prefix

'.'
requested_magnification int

Magnification scale at which to perform computation

None
tile_size int

tile size

None
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
local_config str

url/path to local config YAML file

''

Returns:

Name Type Description
dict

metadata about function call

Source code in src/luna/pathology/cli/visualize_tile_labels_png.py
@timed
@save_metadata
def cli(
    slide_urlpath: str = "???",
    tiles_urlpath: str = "",
    mpp_units: bool = False,
    plot_labels: List[str] = "???",  # type: ignore
    output_urlpath: str = ".",
    requested_magnification: Optional[int] = None,
    tile_size: Optional[int] = None,
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
):
    """Generate nice tile markup images with continuous or discrete tile scores

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        tiles_urlpath (str): url/path to a slide-tile manifest file (.tiles.csv)
        mpp_units (bool): if true, additional rescaling is applied to match micro-meter and pixel coordinate systems
        plot_labels (List[str]): labels to plot
        output_urlpath (str): output url/path prefix
        requested_magnification (int): Magnification scale at which to perform computation
        tile_size (int): tile size
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): url/path to local config YAML file

    Returns:
        dict: metadata about function call
    """
    config = get_config(vars())

    if not config["tile_size"] and not config["tiles_urlpath"]:
        raise fire.core.FireError("Specify either tiles_urlpath or tile_size")

    thumbnails_overlayed = visualize_tiles(
        config["slide_urlpath"],
        config["tiles_urlpath"],
        config["mpp_units"],
        config["plot_labels"],
        config["requested_magnification"],
        config["tile_size"],
        config["storage_options"],
    )

    fs, output_path_prefix = fsspec.core.url_to_fs(
        config["output_urlpath"], **config["output_storage_options"]
    )

    images = {}
    for score_type, thumbnail_overlayed in thumbnails_overlayed.items():
        output_file = (
            Path(output_path_prefix)
            / f"tile_scores_and_labels_visualization_{score_type}.png"
        )
        thumbnail_overlayed = Image.fromarray(thumbnail_overlayed)
        with fs.open(output_file, "wb") as of:
            thumbnail_overlayed.save(of, format="PNG")
        images[score_type] = str(output_file)
        logger.info(f"Saved {score_type} visualization at {output_file}")

    properties = {
        "data": fs.unstrip_protocol(output_path_prefix),
        "images": images,
    }

    return properties
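
A minimal sketch of calling the entrypoint directly from Python (paths are hypothetical, and plot_labels is assumed to match score columns present in the tiles table):

from luna.pathology.cli.visualize_tile_labels_png import cli

metadata = cli(
    slide_urlpath="/data/slides/123.svs",
    tiles_urlpath="/data/tiles/123.tiles.parquet",
    plot_labels=["tumor_score"],
    output_urlpath="/data/visualizations",
)
print(metadata["images"])  # {score_type: path to saved PNG}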

visualize_tiles(slide_urlpath, tiles_urlpath, mpp_units, plot_labels, requested_magnification=None, tile_size=None, storage_options={})

Generate nice tile markup images with continuous or discrete tile scores

Parameters:

Name Type Description Default
slide_urlpath str

url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)

required
tiles_urlpath str

url/path to a slide-tile manifest file (.tiles.csv)

required
mpp_units bool

if true, additional rescaling is applied to match micro-meter and pixel coordinate systems

required
plot_labels List[str]

labels to plot

required
requested_magnification int

Magnification scale at which to perform computation

None
tile_size int

tile size

None
storage_options dict

storage options to pass to reading functions

{}

Returns:

Type Description

dict[str,np.ndarray]: score type to numpy array representation of overlayed thumbnail

Source code in src/luna/pathology/cli/visualize_tile_labels_png.py
def visualize_tiles(
    slide_urlpath: str,
    tiles_urlpath: str,
    mpp_units: bool,
    plot_labels: List[str],
    requested_magnification: Optional[int] = None,
    tile_size: Optional[int] = None,
    storage_options: dict = {},
):
    """Generate nice tile markup images with continuous or discrete tile scores

    Args:
        slide_urlpath (str): url/path to slide image (virtual slide formats compatible with openslide, .svs, .tif, .scn, ...)
        tiles_urlpath (str): url/path to a slide-tile manifest file (.tiles.csv)
        mpp_units (bool): if true, additional rescaling is applied to match micro-meter and pixel coordinate systems
        plot_labels (List[str]): labels to plot
        requested_magnification (int): Magnification scale at which to perform computation
        tile_size (int): tile size
        storage_options (dict): storage options to pass to reading functions

    Returns:
        dict[str,np.ndarray]: score type to numpy array representation of overlayed thumbnail
    """
    if type(plot_labels) == str:
        plot_labels = [plot_labels]

    # Get tiles
    if tiles_urlpath:
        with open(tiles_urlpath, **storage_options) as of:
            df = pd.read_parquet(of).reset_index().set_index("address")
    elif type(tile_size) == int:
        df = generate_tiles(
            slide_urlpath, tile_size, storage_options, requested_magnification
        )
    else:
        raise RuntimeError("Specify tile size or url/path to tiling data")

    with open(slide_urlpath, **storage_options) as of:
        slide = tiffslide.TiffSlide(of)

        to_mag_scale_factor = get_scale_factor_at_magnification(
            slide, requested_magnification=requested_magnification
        )

        # Create thumbnail image for scoring
        sample_arr = get_downscaled_thumbnail(slide, to_mag_scale_factor)

        # See if we need to adjust scale_factor to account for different units
        if mpp_units:
            unit_sf = 0.0
            for mpp_key in ("aperio.MPP", "openslide.mpp-x"):
                if mpp_key in slide.properties:
                    unit_sf = float(slide.properties[mpp_key])
            if unit_sf:
                to_mag_scale_factor *= unit_sf
            else:
                logger.warning(
                    "No MPP scale factor was recognized in slide properties."
                )

    # only visualize tile scores that were able to be computed
    all_score_types = set(plot_labels)
    score_types_to_visualize = set(list(df.columns)).intersection(all_score_types)

    thumbnails_overlayed = {}  # type: Dict[str,np.ndarray]
    for score_type in score_types_to_visualize:
        thumbnails_overlayed[score_type] = visualize_tiling_scores(
            df, sample_arr, to_mag_scale_factor, score_type
        )

    return thumbnails_overlayed
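
A minimal sketch (hypothetical paths; note that the tiles manifest is read as parquet, and only labels that exist as columns in it are rendered):

from PIL import Image
from luna.pathology.cli.visualize_tile_labels_png import visualize_tiles

overlays = visualize_tiles(
    slide_urlpath="/data/slides/123.svs",
    tiles_urlpath="/data/tiles/123.tiles.parquet",
    mpp_units=False,
    plot_labels=["tumor_score"],
)
for score_type, arr in overlays.items():
    Image.fromarray(arr).save(f"overlay_{score_type}.png")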

common

annotation_utils

check_slideviewer_and_download_bmp(sv_project_id, slideviewer_path, slide_id, users, SLIDE_BMP_DIR, SLIDEVIEWER_API_URL, TMP_ZIP_DIR)

download bitmap annotation from slideviewer

Parameters:

Name Type Description Default
sv_project_id str

slideviewer project id

required
slideviewer_path str

filepath to the input slide

required
slide_id str

slide id

required
users List[str]

list of users who provided annotations

required
SLIDE_BMP_DIR str

output folder to save bitmap to

required
SLIDEVIEWER_API_URL str

API url for slide viewer

required
TMP_ZIP_DIR str

temporary directory to save zipped bitmap files to

required

Returns:

Type Description
Union[None, List]

Union[None, List]: returns None if there are no annotations to process, or returns a list containing output parameters

Source code in src/luna/pathology/common/annotation_utils.py
def check_slideviewer_and_download_bmp(
    sv_project_id: str,
    slideviewer_path: str,
    slide_id: str,
    users: List,
    SLIDE_BMP_DIR: str,
    SLIDEVIEWER_API_URL: str,
    TMP_ZIP_DIR: str,
) -> Union[None, List]:
    """download bitmap annotation from slideviwer

    Args:
        sv_project_id (str): slideviewer project id
        slideviewer_path (str): filepath to the input slide
        slide_id (str): slide id
        users (List[str]): list of users who provided annotations
        SLIDE_BMP_DIR (str): output folder to save bitmap to
        SLIDEVIEWER_API_URL (str): API url for slide viewer
        TMP_ZIP_DIR (str): temporary directory to save zipped bitmap files to

    Returns:
        Union[None, List]: returns None if there are no annotations to process, or
            returns a list containing output parameters
    """
    slide_id = str(slide_id)

    outputs = []
    output_dict_base = {
        "sv_project_id": sv_project_id,
        "slideviewer_path": slideviewer_path,
        "slide_id": slide_id,
        "user": "n/a",
        "bmp_filepath": "n/a",
        "npy_filepath": "n/a",
        "geojson": "n/a",
        "geojson_path": "n/a",
        "date": datetime.now(),
    }
    outputs.append(output_dict_base)

    for user in users:
        # download bitmap
        bmp_record_uuid, bmp_filepath = get_slide_bitmap(
            slideviewer_path,
            user,
            slide_id,
            SLIDE_BMP_DIR,
            SLIDEVIEWER_API_URL,
            TMP_ZIP_DIR,
            sv_project_id,
        )
        # convert to npy
        if bmp_record_uuid != "n/a" or bmp_filepath != "n/a":

            output_dict = copy.deepcopy(output_dict_base)
            output_dict["user"] = user
            output_dict["bmp_filepath"] = bmp_filepath
            outputs.append(output_dict)
    # at this point if outputs is empty, return early
    if len(outputs) <= 1:
        return None
    else:
        return outputs
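
A minimal sketch (the project id, paths, users, and API URL are placeholders; a live SlideViewer instance is required):

from luna.pathology.common.annotation_utils import check_slideviewer_and_download_bmp

outputs = check_slideviewer_and_download_bmp(
    sv_project_id="155",
    slideviewer_path="2013;HobS13-283072057510;1435197.svs",
    slide_id="1435197",
    users=["someuser"],
    SLIDE_BMP_DIR="/data/bmps",
    SLIDEVIEWER_API_URL="https://slideviewer.example.org/",
    TMP_ZIP_DIR="/tmp/zips",
)
if outputs is None:
    print("no annotations to process")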

convert_bmp_to_npy(bmp_file, output_folder)

convert bitmap to numpy

Reads a bmp file and creates a friendly numpy ndarray file in uint8 format in the specified output directory, with extension .annot.npy

Troubleshooting

Make sure Pillow is upgraded to version 8.0.0 if getting an Unsupported BMP Size OS Error

Parameters:

Name Type Description Default
bmp_file str

path to .bmp image

required
output_folder str

path to output folder

required

Returns:

Type Description
str

filepath to file containing numpy array

Source code in src/luna/pathology/common/annotation_utils.py
def convert_bmp_to_npy(bmp_file: str, output_folder: str) -> str:
    """convert bitmap to numpy

    Reads a bmp file and creates a friendly numpy ndarray file in uint8 format in the
    specified output directory, with extension .annot.npy

    Troubleshooting:
        Make sure Pillow is upgraded to version 8.0.0 if getting an Unsupported BMP Size OS Error

    Args:
        bmp_file (str): path to .bmp image
        output_folder (str): path to output folder

    Returns:
        str: filepath to file containing numpy array
    """
    Image.MAX_IMAGE_PIXELS = None

    if ".bmp" not in bmp_file:
        return ""

    new_image_name = os.path.basename(bmp_file).replace(".bmp", ".npy")
    bmp_caseid_folder = os.path.basename(os.path.dirname(bmp_file))
    output_caseid_folder = os.path.join(output_folder, bmp_caseid_folder)

    if not os.path.exists(output_caseid_folder):
        os.makedirs(output_caseid_folder)

    output_filepath = os.path.join(output_caseid_folder, new_image_name)

    np.save(output_filepath, np.array(Image.open(bmp_file)))
    return output_filepath
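
A minimal sketch (hypothetical paths; the output lands under a per-case subfolder of output_folder):

import numpy as np
from luna.pathology.common.annotation_utils import convert_bmp_to_npy

npy_path = convert_bmp_to_npy("/data/bmps/case_1/123_user_annot.bmp", "/data/npys")
annotation = np.load(npy_path)  # uint8 label bitmap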

convert_slide_bitmap_to_geojson(outputs, all_labelsets, contour_level, SLIDE_NPY_DIR, slide_store_dir)

convert slide bitmap to geoJSON

Parameters:

Name Type Description Default
outputs List[dict]

list of output parameter dict

required
all_labelsets List[dict]

a list of dictionaries containing label sets

required
contour_level float

value along which to find contours

required
SLIDE_NPY_DIR str

directory containing the slide saved as a .npy

required
slide_store_dir str

directory of the datastore

required

Returns:

Type Description
Tuple[str, List]

Tuple[str, List]: a pair of slide id and output geojson tables

Source code in src/luna/pathology/common/annotation_utils.py
def convert_slide_bitmap_to_geojson(
    outputs,
    all_labelsets: List[dict],
    contour_level: float,
    SLIDE_NPY_DIR: str,
    slide_store_dir: str,
) -> Tuple[str, List]:
    """convert slide bitmap to geoJSON

    Args:
        outputs (List[dict]): list of output parameter dict
        all_labelsets (List[dict]): a list of dictionaries containing label sets
        contour_level (float): value along which to find contours
        SLIDE_NPY_DIR (str): directory containing the slide saved as a .npy
        slide_store_dir (str): directory of the datastore

    Returns:
        Tuple[str, List]: a pair of slide id and output geojson tables
    """

    import warnings

    warnings.warn("convert_slide_bitmap_to_geojson() is currently deprecated!")
    return None

get_slide_bitmap(full_filename, user, slide_id, SLIDE_BMP_DIR, SLIDEVIEWER_API_URL, TMP_ZIP_DIR, sv_project_id)

get slide bitmap

Parameters:

Name Type Description Default
full_filename str

filename of input slide

required
user str

name of pathologist/annotator who labeled the input slide

required
slide_id str

slide id

required
SLIDE_BMP_DIR str

output folder to save bitmap to

required
SLIDEVIEWER_API_URL str

API url for slide viewer

required
TMP_ZIP_DIR str

temporary directory to save zipped bitmap files to

required
sv_project_id str

slide viewer project id

required

Returns:

Type Description
Tuple[str, str]

Tuple[str, str]: a tuple of the bitmap record uuid and filepath to saved bitmap

Source code in src/luna/pathology/common/annotation_utils.py
def get_slide_bitmap(
    full_filename: str,
    user: str,
    slide_id: str,
    SLIDE_BMP_DIR: str,
    SLIDEVIEWER_API_URL: str,
    TMP_ZIP_DIR: str,
    sv_project_id: str,
) -> Tuple[str, str]:
    """get slide bitmap

    Args:
        full_filename (str): filename of input slide
        user (str): name of pathologist/annotator who labeled the input slide
        slide_id (str): slide id
        SLIDE_BMP_DIR (str): output folder to save bitmap to
        SLIDEVIEWER_API_URL (str): API url for slide viewer
        TMP_ZIP_DIR (str): temporary directory to save zipped bitmap files to
        sv_project_id (str): slide viewer project id

    Returns:
        Tuple[str, str]: a tuple of the bitmap record uuid and filepath to saved bitmap
    """

    full_filename_without_ext = full_filename.replace(".svs", "")

    bmp_dirname = os.path.join(
        SLIDE_BMP_DIR, full_filename_without_ext.replace(";", "_")
    )
    bmp_dest_path = os.path.join(bmp_dirname, str(slide_id) + "_" + user + "_annot.bmp")

    if os.path.exists(bmp_dest_path):
        logger.debug("Removing temporary file " + bmp_dest_path)
        os.remove(bmp_dest_path)

    # download bitmap file using api (from brush and fill tool), download zips into TMP_ZIP_DIR
    os.makedirs(TMP_ZIP_DIR, exist_ok=True)
    zipfile_path = os.path.join(
        TMP_ZIP_DIR, full_filename_without_ext + "_" + user + ".zip"
    )

    url = (
        SLIDEVIEWER_API_URL
        + "slides/"
        + str(user)
        + "@mskcc.org/projects;"
        + str(sv_project_id)
        + ";"
        + full_filename
        + "/getLabelFileBMP"
    )

    logger.debug(f"Pulling from Slideviewer URL={url}")

    success = download_zip(url, zipfile_path)

    bmp_record_uuid = "n/a"
    bmp_filepath = "n/a"

    if not success:
        os.remove(zipfile_path)
        return (bmp_record_uuid, bmp_filepath)

    unzipped_file_descriptor = unzip(zipfile_path)

    if unzipped_file_descriptor is None:
        return (bmp_record_uuid, bmp_filepath)

    # create bmp file from unzipped file
    os.makedirs(os.path.dirname(bmp_dest_path), exist_ok=True)
    with open(bmp_dest_path, "wb") as ff:
        ff.write(
            unzipped_file_descriptor.read("labels.bmp")
        )  # all bmps from slideviewer are called labels.bmp

    logger.info(
        "Added slide " + str(slide_id) + " to " + str(bmp_dest_path) + "  * * * * "
    )

    bmp_hash = FileHash("sha256").hash_file(bmp_dest_path)
    bmp_record_uuid = f"SVBMP-{bmp_hash}"
    bmp_filepath = (
        bmp_dirname + "/" + slide_id + "_" + user + "_" + bmp_record_uuid + "_annot.bmp"
    )
    os.rename(bmp_dest_path, bmp_filepath)

    # cleanup
    if os.path.exists(zipfile_path):
        os.remove(zipfile_path)

    return (bmp_record_uuid, bmp_filepath)

build_geojson

add_contours_for_label(annotation_geojson, annotation, label_num, mappings, contour_level)

creates geoJSON feature dictionary for labels

Finds the contours for a label mask, builds a polygon and then converts the polygon to a geoJSON feature dictionary

Parameters:

Name Type Description Default
annotation_geojson dict[str, any]

geoJSON result to populate

required
annotation ndarray

npy array of bitmap

required
label_num int

the integer corresponding to the annotated label

required
mappings dict

label map for specified label set

required
contour_level float

value along which to find contours in the array

required

Returns:

Type Description
Dict[str, any]

dict[str, any]: geoJSON with label contours

Source code in src/luna/pathology/common/build_geojson.py
def add_contours_for_label(
    annotation_geojson: Dict[str, any],
    annotation: np.ndarray,
    label_num: int,
    mappings: dict,
    contour_level: float,
) -> Dict[str, any]:
    """creates geoJSON feature dictionary for labels

    Finds the contours for a label mask, builds a polygon and then converts the polygon
    to a geoJSON feature dictionary

    Args:
        annotation_geojson (dict[str, any]): geoJSON result to populate
        annotation (np.ndarray): npy array of bitmap
        label_num (int): the integer corresponding to the annotated label
        mappings (dict): label map for specified label set
        contour_level (float): value along which to find contours in the array

    Returns:
         dict[str, any]: geoJSON with label contours
    """

    if label_num in annotation:
        print("Building contours for label " + str(label_num))

        num_pixels = np.count_nonzero(annotation == label_num)
        print("num_pixels with label", num_pixels)

        mask = np.where(annotation == label_num, 1, 0).astype(np.int8)
        contours = measure.find_contours(mask, level=contour_level)
        print("num_contours", len(contours))

        polygons = [Polygon(np.squeeze(c)) for c in contours]
        parent_nums = find_parents(polygons)

        polygon_by_index_number = {}

        for index, parent in enumerate(parent_nums):
            contour = contours[index]
            contour_list = contour.tolist()

            # switch coordinates, otherwise gets flipped
            for coord in contour_list:
                x = int(coord[0])
                y = int(coord[1])
                coord[0] = y
                coord[1] = x

            # this polygon does not have parent, so this is a parent object (top level)
            if parent == -1:
                polygon = {
                    "type": "Feature",
                    "properties": {},
                    "geometry": {"type": "Polygon", "coordinates": []},
                }
                polygon["properties"]["label_num"] = int(label_num)
                polygon["properties"]["label_name"] = mappings[label_num]
                polygon["geometry"]["coordinates"].append(contour_list)
                polygon_by_index_number[index] = polygon
            else:
                # this is a child object, add coordinates as a hole to the parent polygon

                # fetch parent's polygon
                parent_polygon = polygon_by_index_number[parent]

                # append as hole to parent
                parent_polygon["geometry"]["coordinates"].append(contour_list)

        # add parent polygon feature dicts to running annotation geojson object
        for index, polygon in polygon_by_index_number.items():
            annotation_geojson["features"].append(polygon)

    else:
        print("No label " + str(label_num) + " found")

    return annotation_geojson
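
A minimal sketch with a toy single-label bitmap (the empty FeatureCollection mirrors the module's geojson_base template):

import numpy as np
from luna.pathology.common.build_geojson import add_contours_for_label

annotation = np.zeros((100, 100), dtype=np.uint8)
annotation[20:40, 20:40] = 1  # one square region with label 1

geojson = {"type": "FeatureCollection", "features": []}
geojson = add_contours_for_label(
    geojson, annotation, label_num=1, mappings={1: "tumor"}, contour_level=0.5
)
print(len(geojson["features"]))  # 1: a single top-level polygon feature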

build_all_geojsons_from_default(default_annotation_geojson, all_labelsets, contour_level)

builds geoJSON objects from a set of labels

wraps build_labelset_specific_geojson with logic to generate annotations from multiple labelsets

Parameters:

Name Type Description Default
default_annotation_geojson dict[str, any]

input geoJSON

required
all_labelsets list[dict]

a list of dictionaries containing label sets

required
contour_level float

value along which to find contours

required

Returns:

Name Type Description
dict dict

a dictionary with labelset name and corresponding geoJSON as key, value pairs

Source code in src/luna/pathology/common/build_geojson.py
def build_all_geojsons_from_default(
    default_annotation_geojson: Dict[str, any],
    all_labelsets: List[dict],
    contour_level: float,
) -> dict:
    """builds geoJSON objects from a set of labels

    wraps build_labelset_specific_geojson with logic to generate annotations
    from multiple labelsets

    Args:
        default_annotation_geojson (dict[str, any]): input geoJSON
        all_labelsets (list[dict]): a list of dictionaries containing label sets
        contour_level (float):  value along which to find contours

    Returns:
        dict: a dictionary with labelset name and corresponding geoJSON as key, value
        pairs

    """

    labelset_name_to_labelset_specific_geojson = {}

    for labelset_name, labelset in all_labelsets.items():
        if labelset_name != DEFAULT_LABELSET_NAME:
            # use default labelset geojson to build labelset specific geojson
            annotation_geojson = build_labelset_specific_geojson(
                default_annotation_geojson, labelset
            )
        else:
            annotation_geojson = default_annotation_geojson

        # only add if geojson not none (built correctly and contains >= 1 polygon)
        if annotation_geojson:
            labelset_name_to_labelset_specific_geojson[labelset_name] = json.dumps(
                annotation_geojson
            )

    return labelset_name_to_labelset_specific_geojson

build_default_geojson_from_annotation(annotation_npy_filepath, all_labelsets, contour_level)

builds geoJSONs from numpy annotation with default label set

Parameters:

Name Type Description Default
annotation_npy_filepath str

file path to numpy annotation

required
all_labelsets dict

a dictionary of label sets

required
contour_level float

value along which to find contours

required

Returns:

Type Description

dict[str, any]: the default geoJSON annotation

Source code in src/luna/pathology/common/build_geojson.py
def build_default_geojson_from_annotation(
    annotation_npy_filepath: str, all_labelsets: dict, contour_level: float
):
    """builds geoJSONS from numpy annotation with default label set

    Args:
        annotation_npy_filepath (str): file path to numpy annotation
        all_labelsets (dict): a dictionary of label sets
        contour_level (float):  value along which to find contours

    Returns:
        dict[str, any]: the default geoJSON annotation
    """

    annotation = np.load(annotation_npy_filepath)
    default_annotation_geojson = copy.deepcopy(geojson_base)

    # signal logic doesn't work in dask distributed setup

    default_labelset = all_labelsets[DEFAULT_LABELSET_NAME]

    if not (annotation > 0).any():
        print(
            f"No annotated pixels detected in bitmap loaded from {annotation_npy_filepath}"
        )
        return None

    # vectorize all
    for label_num in default_labelset:
        default_annotation_geojson = add_contours_for_label(
            default_annotation_geojson,
            annotation,
            label_num,
            default_labelset,
            float(contour_level),
        )

    # empty geojson created, return nan and delete from geojson table
    if len(default_annotation_geojson["features"]) == 0:
        print(
            f"Something went wrong with building default geojson from {annotation_npy_filepath}, quitting"
        )
        return None

    return default_annotation_geojson

build_geojson_from_annotation(df)

Builds geoJSON for all annotation labels in the specified labelset.

Parameters:

Name Type Description Default
df DataFrame

input regional annotation table

required

Returns:

Name Type Description
pandas.DataFrame DataFrame

dataframe with geoJSON field populated

Source code in src/luna/pathology/common/build_geojson.py
def build_geojson_from_annotation(df: pd.DataFrame) -> pd.DataFrame:
    """Builds geoJSON for all annotation labels in the specified labelset.

    Args:
        df (pandas.DataFrame): input regional annotation table

    Returns:
        pandas.DataFrame: dataframe with geoJSON field populated
    """

    labelsets = df.label_config.values[0]
    annotation_npy_filepath = df.npy_filepath.values[0]
    labelset = df.labelset.values[0]
    contour_level = df.contour_level.values[0]

    labelsets = ast.literal_eval(labelsets)
    mappings = labelsets[labelset]

    print("\nBuilding GeoJSON annotation from npy file:", annotation_npy_filepath)

    annotation = np.load(annotation_npy_filepath)
    annotation_geojson = copy.deepcopy(geojson_base)

    signal.signal(signal.SIGALRM, handler)
    signal.alarm(TIMEOUT_SECONDS)

    try:
        for label_num in mappings:
            annotation_geojson = add_contours_for_label(
                annotation_geojson,
                annotation,
                label_num,
                mappings,
                float(contour_level),
            )
    except TimeoutError as err:
        print(
            "Timeout Error occured while building geojson from slide",
            annotation_npy_filepath,
        )
        raise err

    # disables alarm
    signal.alarm(0)

    # empty geojson created, return nan and delete from geojson table
    if len(annotation_geojson["features"]) == 0:
        return df

    df["geojson"] = json.dumps(annotation_geojson)
    return df

build_geojson_from_pointclick_json(labelsets, labelset, sv_json)

Build geoJSON from slideviewer JSON

This method extracts point annotations from a slideviewer json object and converts them to a standardized geoJSON format

Parameters:

Name Type Description Default
labelsets dict

dictionary of label set as string (e.g. {labelset: {label_number: label_name}})

required
labelset str

the name of the labelset e.g. default_labels

required
sv_json list[dict]

annotations from slideviewer in the form of a list of dictionaries

required

Returns:

Name Type Description
list list

a list of geoJSON annotation objects

Source code in src/luna/pathology/common/build_geojson.py
def build_geojson_from_pointclick_json(
    labelsets: dict, labelset: str, sv_json: List[dict]
) -> list:
    """Build geoJSON m slideviewer JSON

    This method extracts point annotations from a slideviewer json object and
    converts them to a standardized geoJSON format

    Args:
        labelsets (dict): dictionary of label set as string (e.g. {labelset:
            {label_number: label_name}})
        labelset (str): the name of the labelset e.g. default_labels
        sv_json (list[dict]): annotations from slideviewer in the form of a list of dictionaries

    Returns:
        list: a list of geoJSON annotation objects
    """

    labelsets = ast.literal_eval(labelsets)
    mappings = labelsets[labelset]

    output_geojson = []
    for entry in sv_json:
        point = {}
        x = int(entry["x"])
        y = int(entry["y"])
        class_num = int(entry["class"])
        if class_num not in mappings:
            continue
        class_name = mappings[class_num]
        coordinates = [x, y]

        point["type"] = "Feature"
        point["id"] = "PathAnnotationObject"
        point["geometry"] = {"type": "Point", "coordinates": coordinates}
        point["properties"] = {"classification": {"name": class_name}}
        output_geojson.append(point)

    return output_geojson
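
A minimal sketch; note that labelsets is passed as the string representation of the label set dictionary, since the function parses it with ast.literal_eval:

from luna.pathology.common.build_geojson import build_geojson_from_pointclick_json

labelsets = "{'default_labels': {1: 'tumor', 2: 'stroma'}}"
sv_json = [{"x": "100", "y": "200", "class": "1"}]

points = build_geojson_from_pointclick_json(labelsets, "default_labels", sv_json)
# one Point feature at (100, 200) classified as 'tumor'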

build_labelset_specific_geojson(default_annotation_geojson, labelset)

builds geoJSON for labelset

Instead of working with a large geoJSON object, you can extract polygons that correspond to specific labels into a smaller object.

Parameters:

Name Type Description Default
default_annotation_geojson dict[str, any]

geoJSON annotation file

required
labelset dict

label set dictionary

required

Returns:

Type Description
Dict[str, any]

dict[str, any]: geoJSON with only polygons from provided labelset

Source code in src/luna/pathology/common/build_geojson.py
def build_labelset_specific_geojson(
    default_annotation_geojson: Dict[str, any], labelset: dict
) -> Dict[str, any]:
    """builds geoJSON for labelset

    Instead of working with a large geoJSON object, you can extract polygons
    that correspond to specific labels into a smaller object.

    Args:
        default_annotation_geojson (dict[str, any]):  geoJSON annotation file
        labelset (dict): label set dictionary

    Returns:
        dict[str, any]: geoJSON with only polygons from provided labelset
    """

    annotation_geojson = copy.deepcopy(geojson_base)

    for feature in default_annotation_geojson["features"]:

        # number is fixed
        label_num = feature["properties"]["label_num"]
        # add polygon to json, change name potentially needed
        if label_num in labelset:
            new_feature_polygon = copy.deepcopy(feature)

            # get new name and change
            new_label_name = labelset[label_num]
            new_feature_polygon["properties"]["label_name"] = new_label_name

            # add to annotation_geojson being built
            annotation_geojson["features"].append(new_feature_polygon)

    # no polygons containing labels in labelset
    if len(annotation_geojson["features"]) == 0:
        return None

    return annotation_geojson

concatenate_regional_geojsons(geojson_list)

concatenate regional annotations

Concatenates geojsons if there is more than one annotation for the labelset.

Parameters:

Name Type Description Default
geojson_list list[dict[str, any]]

list of geoJSON strings

required

Returns:

Type Description
Dict[str, any]

dict[str, any]: a single concatenated geoJSON

Source code in src/luna/pathology/common/build_geojson.py
def concatenate_regional_geojsons(geojson_list: List[Dict[str, any]]) -> Dict[str, any]:
    """concatenate regional annotations

    Concatenates geojsons if there is more than one annotation for the labelset.

    Args:
        geojson_list (list[dict[str, any]]): list of geoJSON strings

    Returns:
        dict[str, any]: a single concatenated geoJSON
    """
    # create json from str representations
    geojson_list = [json.loads(geojson) for geojson in geojson_list]

    concat_geojson = geojson_list[0]
    if len(geojson_list) == 1:
        return concat_geojson

    # create concatenated geojson
    for json_dict in geojson_list[1:]:
        print(f"Concatenating {len(geojson_list)} geojsons")
        concat_geojson["features"].extend(json_dict["features"])

    return concat_geojson
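
A minimal sketch with two single-feature geoJSON strings:

import json
from luna.pathology.common.build_geojson import concatenate_regional_geojsons

a = json.dumps({"type": "FeatureCollection", "features": [{"id": 1}]})
b = json.dumps({"type": "FeatureCollection", "features": [{"id": 2}]})

merged = concatenate_regional_geojsons([a, b])
print(len(merged["features"]))  # 2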

find_parents(polygons)

determines parent-child relationships of polygons

Returns a list of size n (where n is the number of polygons in the input list) where the value at index i corresponds to the i-th polygon's parent. In the case of no parent, -1 is used. For example, parent_nums[0] = 2 means that polygon 0's parent is polygon 2.

Parameters:

Name Type Description Default
polygons list

a list of shapely polygon objects

required

Returns:

Name Type Description
list list

a list of parent-child relationships for the polygon objects

Source code in src/luna/pathology/common/build_geojson.py
def find_parents(polygons: list) -> list:
    """determines of parent child relationships of polygons

    Returns a list of size n (where n is the number of input polygons in the input list
    polygons) where the value at index n cooresponds to the nth polygon's parent. In
    the case of no parent, -1 is used. for example, parent_nums[0] = 2 means that
    polygon 0's parent is polygon 2

    Args:
        polygons (list): a list of shapely polygon objects

    Returns:
        list: a list of parent-child relationships for the polygon objects

    """
    parent_nums = []
    for child in polygons:
        found_parent = False
        for parent_idx, parent in enumerate(polygons):
            if child == parent:
                continue
            # found parent for child
            if parent.contains(child):
                parent_nums.append(parent_idx)
                found_parent = True
                break
        # finished looping through all potential parents, so child is a parent
        if not found_parent:
            parent_nums.append(-1)

    print(parent_nums)

    return parent_nums
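
A minimal sketch with one polygon nested inside another:

from shapely.geometry import Polygon
from luna.pathology.common.build_geojson import find_parents

outer = Polygon([(0, 0), (10, 0), (10, 10), (0, 10)])
inner = Polygon([(2, 2), (4, 2), (4, 4), (2, 4)])
print(find_parents([inner, outer]))  # [1, -1]: polygon 0 is contained by polygon 1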

handler(signum, frame)

signal handler for geojson

Parameters:

Name Type Description Default
signum str

signal number

required
frame str

current stack frame

required

Returns:

Type Description
None

None

Source code in src/luna/pathology/common/build_geojson.py
def handler(signum: str, frame: str) -> None:
    """signal handler for geojson

    Args:
        signum (str): signal number
        frame (str): current stack frame

    Returns:
        None
    """

    raise TimeoutError("Geojson generation timed out.")

deepzoom

DeepZoomGenerator

Source code in src/luna/pathology/common/deepzoom.py
class DeepZoomGenerator:
    BOUNDS_OFFSET_PROPS = (
        tiffslide.PROPERTY_NAME_BOUNDS_X,
        tiffslide.PROPERTY_NAME_BOUNDS_Y,
    )
    BOUNDS_SIZE_PROPS = (
        tiffslide.PROPERTY_NAME_BOUNDS_WIDTH,
        tiffslide.PROPERTY_NAME_BOUNDS_HEIGHT,
    )

    def __init__(
        self,
        urlpath: Union[str, fsspec.core.OpenFile],
        tile_size: int = 254,
        overlap: int = 1,
        limit_bounds: bool = False,
        storage_options: dict = {},
    ) -> None:
        self._z_t_downsample = tile_size
        self._z_overlap = overlap
        self._limit_bounds = limit_bounds

        self._storage_options = storage_options
        if isinstance(urlpath, str):
            self._openfile = fsspec.open(urlpath, **storage_options)
        else:
            self._openfile = urlpath

        with self._openfile as f, tiffslide.TiffSlide(f) as tiff:
            if limit_bounds:
                # Level 0 coordinate offset
                self._l0_offset = tuple(
                    int(tiff.properties.get(prop, 0))
                    for prop in self.BOUNDS_OFFSET_PROPS
                )
                # Slide level dimensions scale factor in each axis
                size_scale = tuple(
                    int(tiff.properties.get(prop, l0_lim)) / l0_lim
                    for prop, l0_lim in zip(self.BOUNDS_SIZE_PROPS, tiff.dimensions)
                )
                # Dimensions of active area
                self._l_dimensions = tuple(
                    tuple(
                        int(math.ceil(l_lim * scale))
                        for l_lim, scale in zip(l_size, size_scale)
                    )
                    for l_size in tiff.level_dimensions
                )
            else:
                self._l_dimensions = tiff.level_dimensions
                self._l0_offset = (0, 0)
            self._l0_dimensions = self._l_dimensions[0]
            # Deep Zoom level
            z_size = self._l0_dimensions
            z_dimensions = [z_size]
            while z_size[0] > 1 or z_size[1] > 1:
                z_size = tuple(max(1, int(math.ceil(z / 2))) for z in z_size)
                z_dimensions.append(z_size)
            self._z_dimensions = tuple(reversed(z_dimensions))

            # Tile
            def tiles(z_lim):
                return int(math.ceil(z_lim / self._z_t_downsample))

            self._t_dimensions = tuple(
                (tiles(z_w), tiles(z_h)) for z_w, z_h in self._z_dimensions
            )

            # Deep Zoom level count
            self._dz_levels = len(self._z_dimensions)

            # Total downsamples for each Deep Zoom level
            l0_z_downsamples = tuple(
                2 ** (self._dz_levels - dz_level - 1)
                for dz_level in range(self._dz_levels)
            )

            # Preferred slide levels for each Deep Zoom level
            self._slide_from_dz_level = tuple(
                tiff.get_best_level_for_downsample(d) for d in l0_z_downsamples
            )

            # Piecewise downsamples
            self._l0_l_downsamples = tiff.level_downsamples
            self._l_z_downsamples = tuple(
                l0_z_downsamples[dz_level]
                / self._l0_l_downsamples[self._slide_from_dz_level[dz_level]]
                for dz_level in range(self._dz_levels)
            )

            # Slide background color
            bg_color = tiff.properties.get(tiffslide.PROPERTY_NAME_BACKGROUND_COLOR)
            if bg_color:
                self._bg_color = "#" + bg_color
            else:
                self._bg_color = "#ffffff"

    @property
    def level_count(self):
        """The number of Deep Zoom levels in the image."""
        return self._dz_levels

    @property
    def level_tiles(self):
        """A list of (tiles_x, tiles_y) tuples for each Deep Zoom level."""
        return self._t_dimensions

    @property
    def level_dimensions(self):
        """A list of (pixels_x, pixels_y) tuples for each Deep Zoom level."""
        return self._z_dimensions

    @property
    def tile_count(self):
        """The total number of Deep Zoom tiles in the image."""
        return sum(t_cols * t_rows for t_cols, t_rows in self._t_dimensions)

    def get_tile(self, level, address):
        """Return an RGB PIL.Image for a tile.

        level:     the Deep Zoom level.
        address:   the address of the tile within the level as a (col, row)
                   tuple."""

        # Read tile
        args, z_size = self._get_tile_info(level, address)
        with self._openfile as f, tiffslide.TiffSlide(f) as tiff:
            tile = tiff.read_region(*args)

            # Apply on solid background
            # bg = Image.new('RGB', tile.size, self._bg_color)
            # tile = Image.composite(tile, bg, tile)

            # Scale to the correct size
            if tile.size != z_size:
                # Image.Resampling added in Pillow 9.1.0
                # Image.LANCZOS removed in Pillow 10
                tile.thumbnail(z_size, getattr(Image, "Resampling", Image).LANCZOS)

            return tile

    def _get_tile_info(self, dz_level, t_location):
        # Check parameters
        if dz_level < 0 or dz_level >= self._dz_levels:
            raise ValueError("Invalid level")
        for t, t_lim in zip(t_location, self._t_dimensions[dz_level]):
            if t < 0 or t >= t_lim:
                raise ValueError(f"Invalid address: {dz_level}:{t_location}")

        # Get preferred slide level
        slide_level = self._slide_from_dz_level[dz_level]

        # Calculate top/left and bottom/right overlap
        z_overlap_tl = tuple(self._z_overlap * int(t != 0) for t in t_location)
        z_overlap_br = tuple(
            self._z_overlap * int(t != t_lim - 1)
            for t, t_lim in zip(t_location, self.level_tiles[dz_level])
        )

        # Get final size of the tile
        z_size = tuple(
            min(self._z_t_downsample, z_lim - self._z_t_downsample * t) + z_tl + z_br
            for t, z_lim, z_tl, z_br in zip(
                t_location, self._z_dimensions[dz_level], z_overlap_tl, z_overlap_br
            )
        )

        # Obtain the region coordinates
        z_location = [self._z_from_t(t) for t in t_location]
        l_location = [
            self._l_from_z(dz_level, z - z_tl)
            for z, z_tl in zip(z_location, z_overlap_tl)
        ]
        # Round location down and size up, and add offset of active area
        l0_location = tuple(
            int(self._l0_from_l(slide_level, loc) + l0_off)
            for loc, l0_off in zip(l_location, self._l0_offset)
        )
        l_size = tuple(
            int(min(math.ceil(self._l_from_z(dz_level, dz)), l_lim - math.ceil(loc)))
            for loc, dz, l_lim in zip(
                l_location, z_size, self._l_dimensions[slide_level]
            )
        )

        # Return read_region() parameters plus tile size for final scaling
        return ((l0_location, slide_level, l_size), z_size)

    def _l0_from_l(self, slide_level, loc):
        return self._l0_l_downsamples[slide_level] * loc

    def _l_from_z(self, dz_level, z):
        return self._l_z_downsamples[dz_level] * z

    def _z_from_t(self, t):
        return self._z_t_downsample * t
level_count property

The number of Deep Zoom levels in the image.

level_dimensions property

A list of (pixels_x, pixels_y) tuples for each Deep Zoom level.

level_tiles property

A list of (tiles_x, tiles_y) tuples for each Deep Zoom level.

tile_count property

The total number of Deep Zoom tiles in the image.

get_tile(level, address)

Return an RGB PIL.Image for a tile.

level: the Deep Zoom level. address: the address of the tile within the level as a (col, row) tuple.

Source code in src/luna/pathology/common/deepzoom.py
def get_tile(self, level, address):
    """Return an RGB PIL.Image for a tile.

    level:     the Deep Zoom level.
    address:   the address of the tile within the level as a (col, row)
               tuple."""

    # Read tile
    args, z_size = self._get_tile_info(level, address)
    with self._openfile as f, tiffslide.TiffSlide(f) as tiff:
        tile = tiff.read_region(*args)

        # Apply on solid background
        # bg = Image.new('RGB', tile.size, self._bg_color)
        # tile = Image.composite(tile, bg, tile)

        # Scale to the correct size
        if tile.size != z_size:
            # Image.Resampling added in Pillow 9.1.0
            # Image.LANCZOS removed in Pillow 10
            tile.thumbnail(z_size, getattr(Image, "Resampling", Image).LANCZOS)

        return tile
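
A minimal usage sketch (hypothetical slide path):

from luna.pathology.common.deepzoom import DeepZoomGenerator

dz = DeepZoomGenerator("/data/slides/123.svs", tile_size=254, overlap=1)
print(dz.level_count, dz.level_tiles[-1])       # levels and tile grid at the deepest level
tile = dz.get_tile(dz.level_count - 1, (0, 0))  # top-left tile at the deepest level
tile.save("tile_0_0.png")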

schemas

SlideTiles

Source code in src/luna/pathology/common/schemas.py
class SlideTiles:
    REQ_COLUMNS = set(
        ["address", "x_coord", "y_coord", "xy_extent", "tile_size", "tile_units"]
    )

    @classmethod
    def check(self, slide_tiles):
        """Returns True if the given path is readable as "SlideTiles <slide_tiles>", else, reaises SchemaMismatchError"""
        df = pd.read_parquet(slide_tiles).reset_index()

        if not set(df.columns).intersection(self.REQ_COLUMNS) == self.REQ_COLUMNS:
            raise SchemaMismatchError(
                "SlideTile failed schema check: missing columns: ",
                (set(df.columns).intersection(self.REQ_COLUMNS)).symmetric_difference(
                    self.REQ_COLUMNS
                ),
            )

        return True
check(slide_tiles) classmethod

Returns True if the given path is readable as "SlideTiles <slide_tiles>", else raises SchemaMismatchError

Source code in src/luna/pathology/common/schemas.py
@classmethod
def check(self, slide_tiles):
    """Returns True if the given path is readable as "SlideTiles <slide_tiles>", else, reaises SchemaMismatchError"""
    df = pd.read_parquet(slide_tiles).reset_index()

    if not set(df.columns).intersection(self.REQ_COLUMNS) == self.REQ_COLUMNS:
        raise SchemaMismatchError(
            "SlideTile failed schema check: missing columns: ",
            (set(df.columns).intersection(self.REQ_COLUMNS)).symmetric_difference(
                self.REQ_COLUMNS
            ),
        )

    return True
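
A minimal usage sketch (hypothetical path):

from luna.pathology.common.schemas import SlideTiles

# returns True, or raises SchemaMismatchError if required columns are absent
SlideTiles.check("/data/tiles/123.tiles.parquet")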

slideviewer_client

Created on January 31, 2021

@author: pashaa@mskcc.org

Functions for downloading annotations from SlideViewer

download_sv_point_annotation(url)

download slideviewer point annotation

Calls slideviewer API with the given url

Parameters:

Name Type Description Default
url str

slide viewer api to call

required

Returns:

Type Description
Dict[str, any]

dict[str, any]: json response

Source code in src/luna/pathology/common/slideviewer_client.py
def download_sv_point_annotation(url: str) -> Dict[str, any]:
    """download slideviwer point annotation

    Calls slideviewer API with the given url

    Args:
        url (str): slide viewer api to call

    Returns:
        dict[str, any]: json response
    """
    try:
        response = requests.get(url)
        data = response.json()
    except Exception:
        logger.exception("General exception raised while trying " + url)
        return None

    logger.info("Found data = " + str(data))
    if str(data) != "[]":
        return data
    else:
        logger.warning("Label annotation file does not exist for slide and user.")
        return None

download_zip(url, dest_path, chunk_size=128)

Download zip file

Downloads zip from the specified URL and saves it to the specified file path. see https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url

Parameters:

Name Type Description Default
url str

slideviewer url to download zip from

required
dest_path str

file path where zipfile should be saved

required
chunk_size int

size in bytes of chunks to batch out during download

128

Returns:

Name Type Description
bool bool

True if zipfile downloaded and saved successfully, else False

Source code in src/luna/pathology/common/slideviewer_client.py
def download_zip(url: str, dest_path: str, chunk_size: int = 128) -> bool:
    """Download zip file

    Downloads zip from the specified URL and saves it to the specified file path.
    see https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url

    Args:
        url (str): slideviewer url to download zip from
        dest_path (str): file path where zipfile should be saved
        chunk_size (int): size in bytes of chunks to batch out during download

    Returns:
        bool: True if zipfile downloaded and saved successfully, else False
    """

    response = requests.get(url, stream=True)
    with open(dest_path, "wb") as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk == b"Label image not found.":  # message from slideviewer
                return False
            else:
                fd.write(chunk)
        return True
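
A minimal sketch pairing download_zip with the unzip helper documented below (hypothetical URL and paths):

from luna.pathology.common.slideviewer_client import download_zip, unzip

if download_zip("https://slideviewer.example.org/getLabelFileBMP", "/tmp/annot.zip"):
    zf = unzip("/tmp/annot.zip")  # zipfile.ZipFile handle, or None if invalid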

fetch_slide_ids(url, project_id, dest_dir, csv_file=None)

get slide ids

Fetch the list of slide ids from the slideviewer server for the project with the specified project id. Alternatively, a slideviewer csv file may be provided to override download from the server.

Parameters:

Name Type Description Default
url str or None

slideviewer url. url may be None if csv_file is specified.

required
project_id int

slideviewer project id from which to fetch slide ids

required
dest_dir str

directory where csv file should be downloaded

required
csv_file str

slideviewer csv file may be provided to override the need to download the file

None

Returns:

Name Type Description
list list

list of (slideviewer_path, slide_id, sv_project_id)

Source code in src/luna/pathology/common/slideviewer_client.py
def fetch_slide_ids(
    url: str, project_id: int, dest_dir: str, csv_file: str = None
) -> list:
    """get slide ids

    Fetch the list of slide ids from the slideviewer server for the project with the
    specified project id. Alternatively, a slideviewer csv file may be provided to
    override download from server.

    Args:
        url (str or None): slideviewer url. url may be None if csv_file is specified.
        project_id (int): slideviewer project id from which to fetch slide ids
        dest_dir (str): directory where csv file should be downloaded
        csv_file (str): slideviewer csv file may be provided to override the need
        to download the file

    Returns:
        list:  list of (slideviewer_path, slide_id, sv_project_id)
    """

    # run on all slides from specified SLIDEVIEWER_CSV file.
    # if file is not specified, then download file using slideviewer API
    # download entire slide set using project id
    # the file is then written to the dest directory
    new_csv_file = os.path.join(dest_dir, "project_" + str(project_id) + ".csv")

    if csv_file is None or csv_file == "" or not os.path.exists(csv_file):

        url = url + "exportProjectCSV?pid={pid}".format(pid=str(project_id))
        res = requests.get(url)

        with open(new_csv_file, "wb") as slideoutfile:
            slideoutfile.write(res.content)

    else:
        # copy given csv_file to dest directory
        shutil.copy(csv_file, new_csv_file)

    # read slide ids
    slides = []
    with open(new_csv_file) as slideoutfile:
        # skip first 4 lines
        count = 0
        for line in slideoutfile:
            count += 1
            if count == 4:
                break

        # read whole slide image file names contained in the project in slide viewer
        for line in slideoutfile:
            full_filename = line.strip()
            slidename = get_slide_id(full_filename)
            slides.append([full_filename, slidename, project_id])

    return slides

get_slide_id(full_filename)

get slide id

Get slide id from the slideviewer full file name. The full_filename in the slideviewer csv is of the format: year;HOBS_ID;slide_id.svs for example: 2013;HobS13-283072057510;1435197.svs

Parameters:

Name Type Description Default
full_filename str

full filename of slide

required

Returns:

Name Type Description
str str

numeric slide id

Source code in src/luna/pathology/common/slideviewer_client.py
def get_slide_id(full_filename: str) -> str:
    """get slide id

    Get slide id from the slideviewer full file name. The full_filename in
    the slideviewer csv is of the format: year;HOBS_ID;slide_id.svs
    for example: 2013;HobS13-283072057510;1435197.svs

    Args:
        full_filename (str): full filename of slide

    Returns:
        str: numeric slide id
    """
    return full_filename.split(";")[-1].replace(".svs", "")
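
For example, using the filename format described above:

from luna.pathology.common.slideviewer_client import get_slide_id

get_slide_id("2013;HobS13-283072057510;1435197.svs")  # '1435197'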

unzip(zipfile_path)

unzip zip file

Parameters:

Name Type Description Default
zipfile_path str

path of zipfile to unzip

required

Returns:

Type Description
any

read file pointer to unzipped file if successfully unzipped, else None

Source code in src/luna/pathology/common/slideviewer_client.py
def unzip(zipfile_path: str) -> any:
    """unzip zip file

    Args:
        zipfile_path (str): path of zipfile to unzip

    Returns:
        read file pointer to unzipped file if successfully unzipped, else None
    """
    logger.info("Unzipping " + zipfile_path)
    try:
        return zipfile.ZipFile(zipfile_path)  # returns read file pointer
    except zipfile.BadZipFile:
        logger.exception("Dumping invalid Zipfile " + zipfile_path + ":")
        return None

utils

address_to_coord(s)

converts address into coordinates

Parameters:

Name Type Description Default
s str

a string consisting of an x_y_z address

required

Returns:

Type Description
Optional[Tuple[int, int]]

Tuple[int, int]: a tuple consisting of an x, y pair

Source code in src/luna/pathology/common/utils.py
def address_to_coord(s: str) -> Optional[Tuple[int, int]]:
    """converts address into coordinates

    Args:
        s (str): a string consisting of an x_y_z address

    Returns:
        Tuple[int, int]: a tuple consisting of an x, y pair
    """
    s = str(s)
    p = re.compile(r"x(\d+)_y(\d+)", re.IGNORECASE)
    m = p.match(s)
    if m:
        x = int(m.group(1))
        y = int(m.group(2))
        return (x, y)
    return None

convert_halo_xml_to_roi(xml_fn)

get roi from halo XML file

Read the rectangle ROI of a halo XML annotation file

Parameters:

Name Type Description Default
xml_fn str

file path to input halo XML file

required

Returns:

Type Description
Optional[Tuple[List, List]]

Tuple[list, list]: returns a tuple of x, y coordinates of the rectangular roi

Source code in src/luna/pathology/common/utils.py
def convert_halo_xml_to_roi(xml_fn: str) -> Optional[Tuple[List, List]]:
    """get roi from halo XML file

    Read the rectangle ROI of a halo XML annotation file

    Args:
        xml_fn: file path to input halo XML file

    Returns:
        Tuple[list, list]: returns a tuple of x, y coordinates of the rectangular roi

    """

    ylist = list()
    xlist = list()

    print("Converting to ROI:", xml_fn)
    e = et.parse(xml_fn).getroot()
    for ann in e.findall("Annotation"):
        regions = ann.findall("Regions")[0]
        if len(regions) == 0:
            continue

        if not regions[0].get("Type") == "Rectangle":
            continue

        for i, r in enumerate(regions):
            vs = r.findall("Vertices")[0]
            vs = vs.findall("V")
            for v in vs:
                y, x = int(v.get("Y").split(".")[0]), int(v.get("X").split(".")[0])
                ylist.append(y)
                xlist.append(x)

    if xlist == [] or ylist == []:
        logger.warning("No Rectangle found, returning None!")
        return None

    if min(xlist) < 0:
        logger.warning("Somehow a negative x rectangle coordinate!")
        xlist = [0, max(xlist)]
    if min(ylist) < 0:
        logger.warning("Somehow a negative y rectangle coordinate!")
        ylist = [0, max(ylist)]

    return xlist, ylist
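
A minimal usage sketch (hypothetical annotation path):

from luna.pathology.common.utils import convert_halo_xml_to_roi

roi = convert_halo_xml_to_roi("/data/annotations/123.annotation.xml")
if roi is not None:
    xlist, ylist = roi  # x and y coordinates of the rectangular ROI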

convert_xml_to_mask(xml_urlpath, shape, annotation_name, storage_options={})

convert xml to bitmask

Converts a sparse halo XML annotation file (polygons) to a dense bitmask

Parameters:

Name Type Description Default
xml_urlpath str

file path to input halo XML file

required
shape list

desired polygon shape

required
annotation_name str

name of annotation

required

Returns:

Type Description
Optional[Tuple[ndarray, Dict[str, Any]]]

Optional[Tuple[np.ndarray, Dict[str, Any]]]: annotation bitmask of specified shape

Source code in src/luna/pathology/common/utils.py
def convert_xml_to_mask(
    xml_urlpath: str,
    shape: list,
    annotation_name: str,
    storage_options: dict = {},
) -> Optional[Tuple[np.ndarray, Dict[str, Any]]]:
    """convert xml to bitmask

    Converts a sparse halo XML annotation file (polygons) to a dense bitmask

    Args:
        xml_urlpath (str): file path to input halo XML file
        shape (list): desired polygon shape
        annotation_name (str): name of annotation

    Returns:
        Optional[Tuple[np.ndarray, Dict[str, Any]]]: annotation bitmask of specified shape
    """

    ret = None
    # Annotations >>
    with open(xml_urlpath, **storage_options) as of:
        e = et.parse(of).getroot()
    e = e.findall("Annotation")
    n_regions = 0
    for ann in e:
        if ann.get("Name") != annotation_name:
            continue

        logger.debug(f"Found region {ann.get('Name')}")

        board_pos = np.zeros(shape, dtype=np.uint8)
        board_neg = np.zeros(shape, dtype=np.uint8)

        regions = ann.findall("Regions")
        assert len(regions) == 1

        rs = regions[0].findall("Region")

        for i, r in enumerate(rs):
            negative_flag = int(r.get("NegativeROA"))
            assert negative_flag == 0 or negative_flag == 1
            negative_flag = bool(negative_flag)

            vs = r.findall("Vertices")[0]
            vs = vs.findall("V")
            vs.append(vs[0])  # last dot should be linked to the first dot

            plist = list()
            for v in vs:
                x, y = int(v.get("X").split(".")[0]), int(v.get("Y").split(".")[0])
                plist.append((x, y))

            if negative_flag:
                board_neg = cv2.drawContours(
                    board_neg, [np.array(plist, dtype=np.int32)], -1, [0, 0, 0], -1
                )
            else:
                board_pos = cv2.drawContours(
                    board_pos,
                    [np.array(plist, dtype=np.int32)],
                    contourIdx=-1,
                    color=[255, 0, 0],
                    thickness=-1,
                )
            n_regions += 1

        ret = (board_pos > 0) * (board_neg == 0)

    if ret.any():
        mask = ret.astype(np.uint8)

        properties = {
            "n_regions": n_regions,
            "n_positive_pixels": np.where(mask > 0, 1, 0).sum(),
        }
        return mask, properties
    return None
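
A minimal usage sketch (hypothetical file and annotation name; shape is assumed to be the desired (rows, cols) of the output mask):

from luna.pathology.common.utils import convert_xml_to_mask

result = convert_xml_to_mask(
    "/data/annotations/123.annotation.xml",
    shape=[40000, 60000],
    annotation_name="Tumor",
)
if result is not None:
    mask, properties = result
    print(properties["n_regions"], properties["n_positive_pixels"])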

coord_to_address(s, magnification)

converts coordinate to address

Parameters:

Name Type Description Default
s tuple[int, int]

coordinate consisting of an (x, y) tuple

required
magnification int

magnification factor

required

Returns:

Name Type Description
str str

a string consisting of an x_y_z address

Source code in src/luna/pathology/common/utils.py
def coord_to_address(s: Tuple[int, int], magnification: Optional[int]) -> str:
    """converts coordinate to address

    Args:
        s (tuple[int, int]): coordinate consisting of an (x, y) tuple
        magnification (int): magnification factor

    Returns:
        str: a string consisting of an x_y_z address
    """

    x = s[0]
    y = s[1]
    address = f"x{x}_y{y}"
    if magnification:
        address += f"_z{magnification}"
    return address
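
A quick round trip between the two address helpers:

from luna.pathology.common.utils import address_to_coord, coord_to_address

coord_to_address((1, 2), 20)   # 'x1_y2_z20'
address_to_coord("x1_y2_z20")  # (1, 2)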

extract_patch_texture_features(image_patch, mask_patch, stain_vectors, stain_channel, plot=False)

extract patch texture features

Runs patch-wise extraction from an image_patch, mask_patch pair given a stain vector and stain channel.

Parameters:

    image_patch (np.ndarray): input image patch. Required.
    mask_patch (np.ndarray): input image mask. Required.
    stain_vectors (np.ndarray): stain vectors extracted from the image patch. Required.
    stain_channel (int): stain channel. Required.
    plot (bool, optional): currently unused. Defaults to False.

Returns:

    Optional[Dict[str, np.ndarray]]: texture features from image patch

Source code in src/luna/pathology/common/utils.py
def extract_patch_texture_features(
    image_patch, mask_patch, stain_vectors, stain_channel, plot=False
) -> Optional[Dict[str, np.ndarray]]:
    """extact patch texture features

    Runs patch-wise extraction from an image_patch, mask_patch pair given a stain
    vector and stain channel.

    Args:
        image_patch (np.ndarray): input image patch
        mask_patch (np.ndarray): input image mask
        stain_vectors (np.ndarray): stain vectors extacted from the image patch
        stain_channel (int): stain channel
        plot (Optional, bool): unused?

    Returns:
        Optional[Dict[str, np.ndarray]]: texture features from image patch

    """

    # logging.getLogger("radiomics.featureextractor").setLevel(logging.WARNING)
    if not (len(np.unique(mask_patch)) > 1 and np.count_nonzero(mask_patch) > 1):
        return None

    output_dict = {}  # type: Dict[str, Any]

    stain_patch = pull_stain_channel(image_patch, stain_vectors, channel=stain_channel)

    original_pixels = stain_patch.astype(np.uint8)[
        np.where(mask_patch.astype(np.bool_))
    ].flatten()
    original_pixels_valid = original_pixels[original_pixels > 0]
    output_dict["original_pixels"] = original_pixels_valid

    extractor = radiomics.featureextractor.RadiomicsFeatureExtractor(binWidth=16)
    extractor.disableAllFeatures()
    extractor.enableImageTypeByName("Original")
    extractor.enableFeatureClassByName("glcm")
    # extractor.enableFeatureByName('original_glcm_MCC', enable=False)

    sitk_image = sitk.GetImageFromArray(stain_patch.astype(np.uint8))
    sitk_mask = sitk.GetImageFromArray(mask_patch.astype(np.uint8))

    try:
        bbox, _ = radiomics.imageoperations.checkMask(sitk_image, sitk_mask)
    except Exception as exc:
        logger.warning(f"Skipping this patch, mask pair due to '{exc}'")
        return None
    else:
        # cimg, cmas = radiomics.imageoperations.cropToTumorMask(sitk_image, sitk_mask, bbox)

        fts = extractor.execute(sitk_image, sitk_mask, voxelBased=True)

        for key in fts.keys():
            if "original_glcm" not in key:
                continue

            stainomics_patch = sitk.GetArrayFromImage(fts[key]).astype(np.float32)
            stainomics_nonzero = stainomics_patch[stainomics_patch != 0].flatten()
            stainomics_valid = stainomics_nonzero[~np.isnan(stainomics_nonzero)]

            output_dict[key] = stainomics_valid

        return output_dict
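
A minimal sketch of a call, assuming an RGB patch and a binary mask are already in hand; the arrays below are synthetic placeholders, so the extracted features are not meaningful:

import numpy as np

# Synthetic placeholder inputs: a 256x256 RGB patch and a matching binary mask
image_patch = np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)
mask_patch = np.zeros((256, 256), dtype=np.uint8)
mask_patch[64:192, 64:192] = 1

stain_vectors = get_stain_vectors_macenko(image_patch)
features = extract_patch_texture_features(
    image_patch, mask_patch, stain_vectors, stain_channel=0
)
if features is not None:  # None when the mask is empty or the radiomics mask check fails
    print(sorted(features.keys()))  # 'original_pixels' plus original_glcm_* feature arrays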

get_downscaled_thumbnail(slide, scale_factor)

get downscaled thumbnail

yields a thumbnail image of a whole slide rescaled by a specified scale factor

Parameters:

    slide (TiffSlide): slide object. Required.
    scale_factor (int): integer scaling factor to resize the whole slide by. Required.

Returns:

    np.ndarray: downsized whole slide thumbnail

Source code in src/luna/pathology/common/utils.py
@timed
def get_downscaled_thumbnail(
    slide: TiffSlide, scale_factor: Union[int, float]
) -> np.ndarray:
    """get downscaled thumbnail

    yields a thumbnail image of a whole slide rescaled by a specified scale factor

    Args:
        slide (TiffSlide): slide object
        scale_factor (int): integer scaling factor to resize the whole slide by

    Returns:
        np.ndarray: downsized whole slide thumbnail
    """
    new_width = slide.dimensions[0] // scale_factor
    new_height = slide.dimensions[1] // scale_factor
    img = slide.get_thumbnail((int(new_width), int(new_height)))
    return np.array(img)
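
For instance, with a placeholder slide path:

from tiffslide import TiffSlide

slide = TiffSlide("slide_123.svs")  # placeholder WSI path
thumbnail = get_downscaled_thumbnail(slide, scale_factor=32)
print(thumbnail.shape)  # approximately (height // 32, width // 32, 3)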

get_full_resolution_generator(slide_urlpath, tile_size, storage_options={})

Return a DeepZoomGenerator and the full-resolution generator level

Parameters:

    slide_urlpath (str): slide urlpath. Required.
    tile_size (int): tile size. Required.
    storage_options (dict): storage options to pass to the filesystem. Defaults to {}.

Returns:

    Tuple[DeepZoomGenerator, int]: the generator and its highest (full-resolution) level

Source code in src/luna/pathology/common/utils.py
def get_full_resolution_generator(
    slide_urlpath: str,
    tile_size: int,
    storage_options: dict = {},
) -> Tuple[DeepZoomGenerator, int]:
    """Return MinimalComputeAperioDZGenerator and generator level

    Args:
        slide_urlpath (str): slide urlpath

    Returns:
        Tuple[MinimalComputeAperioDZGenerator, int]
    """
    generator = DeepZoomGenerator(
        slide_urlpath,
        overlap=0,
        tile_size=tile_size,
        limit_bounds=False,
        storage_options=storage_options,
    )

    generator_level = generator.level_count - 1
    # assert generator.level_dimensions[generator_level] == slide.dimensions
    return generator, generator_level
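
A short usage sketch, with a placeholder slide path:

generator, level = get_full_resolution_generator("slide_123.svs", tile_size=256)
tile = generator.get_tile(level, (0, 0))  # top-left tile at the full-resolution level
print(generator.level_count, level)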

get_layer_names(xml_urlpath, storage_options={})

get available layer names

Finds all possible annotation layer names from a Halo generated xml ROI file

Parameters:

    xml_urlpath (str): absolute or relative file path to the input Halo XML file. Prefix
        with a scheme to use alternative filesystems. Required.
    storage_options (dict): storage options to pass to the filesystem. Defaults to {}.

Returns:

    set: Available region names

Source code in src/luna/pathology/common/utils.py
def get_layer_names(xml_urlpath, storage_options={}):
    """get available layer names

    Finds all possible annotation layer names from a Halo generated xml ROI file

    Args:
        xml_urlpath (str): absolute or relative file path to the input Halo XML file.
            Prefix with a scheme to use alternative filesystems.

    Returns:
        set: Available region names
    """  # Annotations >>
    with open(xml_urlpath, "r", **storage_options) as of:
        e = et.parse(of).getroot()
    e = e.findall("Annotation")
    names = {ann.get("Name") for ann in e}

    return names
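
For example, with a placeholder path:

names = get_layer_names("slide_123.annotations.xml")  # placeholder Halo XML path
print(names)  # e.g. {'Tumor', 'Stroma'}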

get_scale_factor_at_magnification(slide, requested_magnification)

get scale factor at magnification

Return a scale factor if slide scanned magnification and requested magnification are different.

Parameters:

    slide (TiffSlide): slide object. Required.
    requested_magnification (Optional[int]): requested magnification. Required.

Returns:

    float: scale factor required to achieve requested magnification

Source code in src/luna/pathology/common/utils.py
def get_scale_factor_at_magnification(
    slide: TiffSlide, requested_magnification: Optional[int]
) -> float:
    """get scale factor at magnification

    Return a scale factor if slide scanned magnification and
    requested magnification are different.

    Args:
        slide (TiffSlide): slide object
        requested_magnification (Optional[int]): requested magnification

    Returns:
        float: scale factor required to achieve requested magnification
    """
    # First convert to float to handle true integers encoded as string floats (e.g. '20.000')
    mag_value = float(slide.properties["aperio.AppMag"])

    # Then convert to integer
    scanned_magnification = int(mag_value)

    # Make sure we don't have non-integer magnifications
    if not int(mag_value) == mag_value:
        raise RuntimeError(
            "Can't handle slides scanned at non-integer magnifications! (yet)"
        )

    # Verify magnification valid
    scale_factor = 1.0
    if requested_magnification and scanned_magnification != requested_magnification:
        if scanned_magnification < requested_magnification:
            raise ValueError(
                f"Expected magnification <={scanned_magnification} but got {requested_magnification}"
            )
        elif (scanned_magnification % requested_magnification) == 0:
            scale_factor = scanned_magnification // requested_magnification
        else:
            logger.warning("Scale factor is not an integer, be careful!")
            scale_factor = scanned_magnification / requested_magnification

    return scale_factor
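
A worked example of the branching above, for an open TiffSlide whose aperio.AppMag property is '40.000' (values illustrative):

scale = get_scale_factor_at_magnification(slide, requested_magnification=20)
# scale == 2 (40 % 20 == 0, so integer division applies)
# requested_magnification=30 -> warning logged, scale == 40 / 30 (non-integer)
# requested_magnification=80 -> ValueError (requested exceeds scanned magnification)
# requested_magnification=None -> scale == 1.0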

get_stain_vectors_macenko(sample)

get_stain_vectors

Uses the staintools MacenkoStainExtractor to extract stain vectors

Parameters:

    sample (np.ndarray): input patch. Required.

Returns:

    np.ndarray: the stain matrix

Source code in src/luna/pathology/common/utils.py
def get_stain_vectors_macenko(sample: np.ndarray) -> np.ndarray:
    """get_stain_vectors

    Uses the staintools MacenkoStainExtractor to extract stain vectors

    Args:
        sample (np.ndarray): input patch
    Returns:
        np.ndarray: the stain matrix

    """
    from staintools.stain_extraction.macenko_stain_extractor import (
        MacenkoStainExtractor,  # type: ignore
    )

    extractor = MacenkoStainExtractor()
    vectors = extractor.get_stain_matrix(sample)
    return vectors

get_tile_array(row, storage_options={})

Returns a tile image as a numpy array.

Parameters:

    row (pd.DataFrame): row with address (as its name) and tile_store columns. Required.
    storage_options (dict): storage options to pass to the filesystem. Defaults to {}.
Source code in src/luna/pathology/common/utils.py
def get_tile_array(row: pd.DataFrame, storage_options: dict = {}) -> np.ndarray:
    """
    Returns a tile image as a numpy array.

    Args:
        row (pd.DataFrame): row with address (as its name) and tile_store columns
    """
    fs, path = fsspec.core.url_to_fs(row.tile_store, **storage_options)
    cache_fs = fsspec.filesystem("filecache", fs=fs)
    with cache_fs.open(path, "rb", **storage_options) as of:
        with h5py.File(of, "r") as hf:
            tile = np.array(hf[row.name])
            return tile

get_tile_arrays(indices, input_slide_urlpath, tile_size, storage_options={})

Get tile arrays for the tile indices

Parameters:

    indices (List[int]): list of integers to return as tiles. Required.
    input_slide_urlpath (str): path to WSI. Required.
    tile_size (int): width, height of generated tile. Required.
    storage_options (dict): storage options to pass to the filesystem. Defaults to {}.

Returns:

    List[Tuple[int, np.ndarray]]: a list of tuples (index, tile array) for given indices

Source code in src/luna/pathology/common/utils.py
def get_tile_arrays(
    indices: List[int],
    input_slide_urlpath: str,
    tile_size: int,
    storage_options: dict = {},
) -> List[Tuple[int, np.ndarray]]:
    """
    Get tile arrays for the tile indices

    Args:
        indices (List[int]): list of integers to return as tiles
        input_slide_urlpath (str): path to WSI
        tile_size (int): width, height of generated tile

    Returns:
        a list of tuples (index, tile array) for given indices
    """
    full_generator, full_level = get_full_resolution_generator(
        input_slide_urlpath, tile_size=tile_size, storage_options=storage_options
    )
    return [
        (
            index,
            np.array(
                full_generator.get_tile(
                    full_level, address_to_coord(str(index))
                ).resize((tile_size, tile_size))
            ),
        )
        for index in indices
    ]

get_tile_color(score)

get tile color

uses the default color palette to return the color of a tile based on its score

Parameters:

    score (Union[str, float]): a value between [0,1] such as the Otsu threshold, purple
        score, a model output, etc. Required.

Returns:

    Union[float, None]: the color if the input is of a valid type, else None

Source code in src/luna/pathology/common/utils.py
def get_tile_color(score: Union[str, float]) -> Optional[npt.ArrayLike]:
    """get tile color

    uses the default color palette to return the color of a tile based on its score

    Args:
        score (Union[str, float]): a value between [0,1] such as the
            Otsu threshold, purple score, a model output, etc.
    Returns:
        Union[float, None]: returns the color if the input is of a valid type,
            else None

    """
    # categorical
    if isinstance(score, str):
        if score in categorical_colors:
            return categorical_colors[score]
        else:
            tile_color = 255 * np.array(categorial[len(categorical_colors.keys())])
            categorical_colors[score] = tile_color
            return tile_color

    # float, expected to be value from [0,1]
    elif isinstance(score, float) and score <= 1.0 and score >= 0.0:
        tile_color = np.array([int(255 * i) for i in palette(score)[:3]])
        return tile_color

    else:
        print("Invalid Score Type")
        return None
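
A behavior sketch following the branches above:

get_tile_color(0.25)     # np.array of three RGB values from the continuous palette
get_tile_color("Tumor")  # cached (or newly assigned) categorical color
get_tile_color(1.5)      # out of range: prints "Invalid Score Type", returns None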

pull_stain_channel(patch, vectors, channel=None)

pull stain channel

deconvolves the image patch into stain channels, optionally returning a single channel

Parameters:

    patch (np.ndarray): input image patch. Required.
    vectors (np.ndarray): stain vectors. Required.
    channel (Optional[int]): stain channel to select; if None, all channels are returned.
        Defaults to None.

Returns:

    np.ndarray: the deconvolved stain image, or a single stain channel if requested

Source code in src/luna/pathology/common/utils.py
def pull_stain_channel(
    patch: np.ndarray, vectors: np.ndarray, channel: Optional[int] = None
) -> np.ndarray:
    """pull stain channel

    deconvolves the image patch into stain channels using the given stain vectors,
    optionally returning a single channel

    Args:
        patch (np.ndarray): input image patch
        vectors (np.ndarray): stain vectors
        channel (Optional[int]): stain channel to select; if None, all channels are returned

    Returns:
        np.ndarray: the deconvolved stain image, or a single stain channel if requested
    """

    from staintools.miscellaneous.get_concentrations import (
        get_concentrations,  # type: ignore
    )

    tile_concentrations = get_concentrations(patch, vectors)
    identity = np.array([[1, 0, 0], [0, 1, 0]])
    tmp = 255 * (1 - np.exp(-1 * np.dot(tile_concentrations, identity)))
    tmp = tmp.reshape(patch.shape).astype(np.uint8)
    if channel is not None:
        return tmp[:, :, channel]
    else:
        return tmp
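
A short sketch combining this with get_stain_vectors_macenko; image_patch is assumed to be an RGB H&E tile:

vectors = get_stain_vectors_macenko(image_patch)             # stain matrix from the patch
stain = pull_stain_channel(image_patch, vectors, channel=0)  # single-channel uint8 image
deconv = pull_stain_channel(image_patch, vectors)            # all deconvolved channels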

visualize_tiling_scores(df, thumbnail_img, scale_factor, score_type_to_visualize, normalize=True)

visualize tile scores

draws colored boxes around tiles to indicate the value of the score

Parameters:

    df (pd.DataFrame): input dataframe. Required.
    thumbnail_img (np.ndarray): input thumbnail image, modified in place. Required.
    scale_factor (float): scale factor between slide and thumbnail coordinates. Required.
    score_type_to_visualize (str): column name from data frame. Required.
    normalize (bool): whether to min-max normalize numeric scores. Defaults to True.

Returns:

    np.ndarray: new thumbnail image with boxes around tiles indicating the value of the score

Source code in src/luna/pathology/common/utils.py
def visualize_tiling_scores(
    df: pd.DataFrame,
    thumbnail_img: np.ndarray,
    scale_factor: float,
    score_type_to_visualize: str,
    normalize=True,
) -> np.ndarray:
    """visualize tile scores

    draws colored boxes around tiles to indicate the value of the score

    Args:
        df (pd.DataFrame): input dataframe
        thumbnail_img (np.ndarray): input thumbnail image, modified in place
        scale_factor (float): scale factor between slide and thumbnail coordinates
        score_type_to_visualize (str): column name from data frame
        normalize (bool): whether to min-max normalize numeric scores

    Returns:
        np.ndarray: new thumbnail image with boxes around tiles indicating the
        value of the score
    """

    assert isinstance(thumbnail_img, np.ndarray)

    if normalize and df[score_type_to_visualize].dtype.kind in "biuf":
        df[score_type_to_visualize] = (
            df[score_type_to_visualize] - np.min(df[score_type_to_visualize])
        ) / np.ptp(df[score_type_to_visualize])

    for _, row in tqdm(df.iterrows(), total=len(df)):
        if "regional_label" in row and pd.isna(row.regional_label):
            continue

        start = (
            row.y_coord / scale_factor,
            row.x_coord / scale_factor,
        )  # flip because OpenSlide uses (column, row), but skimage uses (row, column)

        rr, cc = rectangle_perimeter(
            start=start,
            extent=(row.xy_extent / scale_factor, row.xy_extent / scale_factor),
            shape=thumbnail_img.shape,
        )

        # set color based on intensity of value instead of black border (1)
        score = row[score_type_to_visualize]

        thumbnail_img[rr, cc] = get_tile_color(score)

    return thumbnail_img
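
Putting the pieces together; the dataframe and score column are hypothetical, and the thumbnail is copied because the function draws on it in place:

annotated = visualize_tiling_scores(
    df_tiles,          # hypothetical dataframe with x_coord, y_coord, xy_extent columns
    thumbnail.copy(),  # e.g. from get_downscaled_thumbnail
    scale_factor=32,
    score_type_to_visualize="otsu_score",  # hypothetical score column
)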

dsa

dsa_api_handler

copy_item(gc, item_id, destination_id)

Copies the item to the destination.

Parameters:

    gc: girder_client. Required.
    item_id (string): uuid of the item to be copied. Required.
    destination_id (string): uuid of the destination folder. Required.
Source code in src/luna/pathology/dsa/dsa_api_handler.py
def copy_item(gc, item_id: str, destination_id: str):
    """
    Copies the item to the destination.

    Args:
        gc: girder_client
        item_id (string): uuid of the item to be copied
        destination_id (string): uuid of the destination folder
    """
    request_url = f"item/{item_id}/copy?folderId={destination_id}"
    try:
        gc.post(request_url)
    except Exception as err:
        logger.error(f"Error copying item: {err}")
        raise RuntimeError("Can not copy item")

create_collection(gc, collection_name)

Creates a dsa collection and returns a collection uuid from the created collection on successful creation.

Parameters:

    gc: girder client. Required.
    collection_name (string): name of the collection. Required.

Returns:

    Optional[str]: DSA collection uuid, or None if the post request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def create_collection(gc, collection_name: str) -> Optional[str]:
    """
    Creates a dsa collection and returns a collection uuid from the created
    collection on successful creation.

    Args:
        gc: girder client
        collection_name (string): name of the collection

    Returns:
        string: DSA collection uuid, or None if the post request fails.
    """
    try:
        gc.createCollection(collection_name)
        logger.debug(f"Created collection {collection_name}")
        new_collection_id = get_collection_uuid(gc, collection_name)
        logger.debug(f"Collection {collection_name} has id {new_collection_id}")
    except Exception as err:
        logger.error(f"Couldn't create collection {collection_name} : {err}")
        return None

    return new_collection_id

create_folder(gc, folder_name, parent_type, parent_id)

Creates a dsa folder and returns a folder uuid from the created folder on successful creation.

Parameters:

    gc: girder client. Required.
    folder_name (string): name of the folder in DSA. Required.
    parent_type (string): type of the parent container (i.e. folder or collection). Required.
    parent_id (string): uuid of the parent container. Required.

Returns:

    Optional[str]: DSA folder uuid, or None if the post request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def create_folder(
    gc, folder_name: str, parent_type: str, parent_id: str
) -> Optional[str]:
    """
    Creates a dsa folder and returns a folder uuid from the created
    folder on successful creation.

    Args:
        gc: girder client
        folder_name (string): name of the folder in DSA
        parent_type (string): type of the parent container (ie. folder, collection)
        parent_id (string): uuid of the parent container

    Returns:
        string: DSA folder uuid, or None if the post request fails.
    """
    try:
        gc.createFolder(parent_id, folder_name, parentType=parent_type)
        logger.debug(f"Created folder {folder_name}")
        new_folder_uuid = get_folder_uuid(gc, folder_name, parent_type, parent_id)
        logger.debug(f"Folder {folder_name} has id {new_folder_uuid}")
    except Exception as err:
        logger.error(f"Couldn't create folder {folder_name} : {err}")
        return None

    return new_folder_uuid

create_s3_assetstore(gc, name, bucket, access, secret, service)

Creates a s3 assetstore.

Parameters:

    gc: girder client. Required.
    name (string): name of the assetstore. Required.
    bucket (string): name of the s3 bucket. Required.
    access (string): s3 access key ID. Required.
    secret (string): s3 secret access key. Required.
    service (string): url of the s3 host. Required.

Returns:

    Optional[str]: DSA assetstore uuid. Raises RuntimeError if the post request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def create_s3_assetstore(
    gc, name: str, bucket: str, access: str, secret: str, service: str
) -> Optional[str]:
    """
    Creates a s3 assetstore.

    Args:
        gc: girder client
        name (string): name of the assetstore
        bucket (string): name of the s3 bucket
        access (string): s3 access key ID
        secret (string): s3 secret access key
        service (string): url of the s3 host

    Returns:
        string: DSA assetstore uuid. Raises RuntimeError if the post request fails.
    """
    request_url = (
        f"assetstore?name={name}&type=2&bucket={bucket}&accessKeyId={access}"
        + f"&secret={secret}&service={service}"
    )
    try:
        gc.post(request_url)
        logger.debug(f"Created assetstore {name}")
        new_assetstore_uuid = get_assetstore_uuid(gc, name)
        logger.debug(f"Assetstore {name} has id {new_assetstore_uuid}")
    except Exception as err:
        logger.error(f"Couldn't create assetstore {name}: {err}")
        raise RuntimeError("Unable to create s3 assetstore")

    return new_assetstore_uuid

dsa_authenticate(gc, username, password)

Authenticate girder client

Parameters:

    gc: girder client. Required.
    username (str): DSA username. Required.
    password (str): DSA password. Required.
Source code in src/luna/pathology/dsa/dsa_api_handler.py
def dsa_authenticate(gc, username, password):
    """Authenticate girder client

    Args:
        gc: girder client
        username (str): DSA username
        password (str): DSA password
    """
    # Initial connection
    try:
        gc.authenticate(username, password)
        logger.info(f"Connected to DSA @ {gc.urlBase}")
    except girder_client.AuthenticationError:
        logger.exception("Couldn't authenticate DSA due to AuthenticationError")
        raise RuntimeError("Connection to DSA endpoint failed.")
    except Exception:
        logger.exception("Couldn't authenticate DSA due to some other exception")
        raise RuntimeError("Connection to DSA endpoint failed.")
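
Throughout this module, gc is a girder_client.GirderClient. A minimal connection sketch, with placeholder URL and credentials:

import girder_client

gc = girder_client.GirderClient(apiUrl="http://localhost:8080/api/v1")  # placeholder URL
dsa_authenticate(gc, "username", "password")  # placeholder credentials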

get_annotation_df(gc, annotation_uuid)

Return annotation metadata (regions) for a given annotation as a dataframe

Parameters:

    gc: girder client. Required.
    annotation_uuid (str): DSA annotation uuid. Required.

Returns:

    pd.DataFrame: annotation/region metadata, with slide_item_uuid as an additional index

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_annotation_df(gc, annotation_uuid):
    """Return annotation metadata (regions) for a given annotation as a dataframe

    Args:
        gc: girder client
        annotation_uuid (str): DSA annotation uuid
    Returns:
        pd.DataFrame: annotation/region metadata, with slide_item_uuid as an additional index
    """
    # Here we get all the annotation data as a json document
    annot = gc.get(f"annotation/{annotation_uuid}")
    (
        df_summary,
        df_regions,
    ) = histomicstk.annotations_and_masks.annotation_and_mask_utils.parse_slide_annotations_into_tables(
        [annot]
    )

    # Let's process the coordinates a bit...
    df_regions["x_coords"] = [
        [int(x) for x in coords_x.split(",")] for coords_x in df_regions["coords_x"]
    ]
    df_regions["y_coords"] = [
        [int(x) for x in coords_x.split(",")] for coords_x in df_regions["coords_y"]
    ]
    df_regions = df_regions.drop(columns=["coords_x", "coords_y"])

    # And join the summary data with the regional data
    df_annotations = (
        df_summary.set_index("annotation_girder_id")
        .join(df_regions.set_index("annotation_girder_id"))
        .reset_index()
    )

    df_annotations = df_annotations.rename(columns={"itemId": "slide_item_uuid"})

    return df_annotations

get_assetstore_uuid(gc, assetstore_name)

Returns the DSA assetstore uuid from the provided assetstore_name

Parameters:

    gc: girder client. Required.
    assetstore_name (string): name of the assetstore in DSA. Required.

Returns:

    Optional[str]: DSA assetstore uuid, or None if nothing matches the assetstore name.
        Raises RuntimeError if the get request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_assetstore_uuid(gc, assetstore_name: str) -> Optional[str]:
    """Returns the DSA assetstore uuid from the provided `assetstore_name`

    Args:
        gc: girder client
        assetstore_name (string): name of the assetstore in DSA

    Returns:
        string: DSA assetstore uuid, or None if nothing matches the assetstore name.
                Raises RuntimeError if the get request fails.
    """
    try:
        df_assetstores = pd.DataFrame(gc.get("assetstore?"))
        if len(df_assetstores):
            df_assetstores = df_assetstores.set_index("_id")
            df_assetstores = df_assetstores.query(f"name=='{assetstore_name}'")
        logger.debug(f"Found assetstores {df_assetstores}")
    except Exception as err:
        logger.error(f"Couldn't retrieve data from DSA: {err}")
        raise RuntimeError("Connection to DSA endpoint failed.")

    if len(df_assetstores) == 0:
        logger.debug(f"No matching assetstore '{assetstore_name}'")
        return None

    assetstore_uuid = df_assetstores.index.item()

    logger.info(
        f"Found assetstore id={assetstore_uuid} for assetstore={assetstore_name}"
    )

    return assetstore_uuid

get_collection_metadata(collection_name, gc)

A function used to get the stylesheet associated with a DSA collection. The stylesheet can store the labels used in the annotation process

Parameters:

    collection_name (str): name of DSA collection used to store the slides. Required.
    gc: girder client. Required.

Returns:

    Optional[Tuple[str, Dict[str, any]]]: a tuple consisting of the collection uuid and the
        stylesheet in JSON format (None if no stylesheet is set), or None if the collection
        cannot be found

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_collection_metadata(
    collection_name: str, gc
) -> Optional[Tuple[str, Dict[str, any]]]:
    """A function used to get the stylehseet associated with a DSA collection. The stylesheet
    can store the labels used in the annotation process

    Args:
        collection_name (str): name of DSA collection used to store the slides
        gc: girder client
    Returns:
        Optional[Tuple[str, Dict[str, any]]]: a tuple consisting of the collection uuid
            and thei stylesheet in JSON format or None if no stylesheet is associated
            with the provided collection
    """

    collection_uuid = get_collection_uuid(gc, collection_name)

    if collection_uuid is not None:
        logger.debug("retreived collection uuid")

        # try get request from girder
        try:
            collection_response = gc.get(f"/collection/{collection_uuid}")
        except requests.exceptions.HTTPError as err:
            logger.error(
                f"Error in collection get request: {err.response.status_code}, {err.response.text}"
            )
            return None

        # if response successful, attempt to get stylesheet
        try:
            metadata_stylesheet = collection_response["meta"]["stylesheet"]
        except KeyError:
            logger.error(f"No stylesheet in collection: {collection_uuid}")
            metadata_stylesheet = None
    else:
        logger.warning(f"Invalid collection name: {collection_name}")
        return None

    return (collection_uuid, metadata_stylesheet)

get_collection_uuid(gc, collection_name)

Returns the DSA collection uuid from the provided collection_name

Parameters:

    gc: girder client. Required.
    collection_name (string): name of the collection in DSA. Required.

Returns:

    Optional[str]: DSA collection uuid, or None if nothing matches the collection name.
        Raises RuntimeError if the get request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_collection_uuid(gc, collection_name: str) -> Optional[str]:
    """Returns the DSA collection uuid from the provided `collection_name`

    Args:
        gc: girder client
        collection_name (string): name of the collection in DSA

    Returns:
        string: DSA collection uuid, or None if nothing matches the collection name.
                Raises RuntimeError if the get request fails.
    """
    try:
        df_collections = pd.DataFrame(gc.listCollection())
        if len(df_collections):
            df_collections = df_collections.set_index("_id")
            df_collections = df_collections.query(f"name=='{collection_name}'")
        logger.debug(f"Found collections {df_collections}")
    except Exception as err:
        logger.error(f"Couldn't retrieve data from DSA: {err}")
        raise RuntimeError("Connection to DSA endpoint failed.")

    # Look for a collection called our collection name
    if len(df_collections) == 0:
        logger.debug(f"No matching collection '{collection_name}'")
        return None

    collection_uuid = df_collections.index.item()

    logger.info(
        f"Found collection id={collection_uuid} for collection={collection_name}"
    )

    return collection_uuid

get_folder_uuid(gc, folder_name, parent_type, parent_id)

Returns the DSA folder uuid from the provided folder_name

Parameters:

    gc: girder client. Required.
    folder_name (string): name of the folder in DSA. Required.
    parent_type (string): type of the parent container (i.e. folder or collection). Required.
    parent_id (string): uuid of the parent container. Required.

Returns:

    Optional[str]: DSA folder uuid, or None if nothing matches the folder name.
        Raises RuntimeError if the get request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_folder_uuid(
    gc, folder_name: str, parent_type: str, parent_id: str
) -> Optional[str]:
    """Returns the DSA folder uuid from the provided `folder_name`

    Args:
        gc: girder client
        folder_name (string): name of the folder in DSA
        parent_type (string): type of the parent container (ie. folder, collection)
        parent_id (string): uuid of the parent container

    Returns:
        string: DSA folder uuid, or None if nothing matches the folder name.
                Raises RuntimeError if the get request fails.
    """
    try:
        df_folders = pd.DataFrame(gc.listFolder(parent_id, parent_type))
        if len(df_folders):
            df_folders = df_folders.set_index("_id")
            df_folders = df_folders.query(f"name=='{folder_name}'")
        logger.debug(f"Found folders {df_folders}")
    except Exception as err:
        logger.error(f"Couldn't retrieve data from DSA: {err}")
        raise RuntimeError("Connection to DSA endpoint failed.")

    if len(df_folders) == 0:
        logger.debug(f"No matching folders '{folder_name}'")
        return None

    folder_uuid = df_folders.index.item()

    logger.info(f"Found folder id={folder_uuid} for folder={folder_name}")

    return folder_uuid

get_item_uuid(gc, image_name, collection_name)

Returns the DSA item uuid from the provided image_name

Parameters:

    gc: girder client. Required.
    image_name (string): name of the image in DSA e.g. 123.svs. Required.
    collection_name (str): name of DSA collection. Required.

Returns:

    Optional[str]: DSA item uuid, or None if nothing matches the collection/image name.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_item_uuid(gc, image_name: str, collection_name: str) -> Optional[str]:
    """Returns the DSA item uuid from the provided `image_name`

    Args:
        image_name (string): name of the image in DSA e.g. 123.svs
        collection_name (str): name of DSA collection
        gc: girder client

    Returns:
        string: DSA item uuid. None if nothing matches the collection/image name.
    """

    collection_uuid = get_collection_uuid(gc, collection_name)
    if not collection_uuid:
        return None

    image_id = Path(image_name).stem

    try:
        uuid_response = gc.get(f'/item?text="{image_id}"')

    except requests.exceptions.HTTPError as err:
        logger.error(
            f"Error in item get request: {err.response.status_code}, {err.response.text}"
        )
        return None

    if uuid_response is not None and len(uuid_response) > 0:
        # multiple entries can come up based on substring matches, return the correct item id by checking name field in dictionary.
        for uuid_response_dict in uuid_response:
            if "name" in uuid_response_dict and "_id" in uuid_response_dict:
                if (
                    uuid_response_dict["name"] == image_name
                    and uuid_response_dict["baseParentId"] == collection_uuid
                ):
                    dsa_uuid = uuid_response_dict["_id"]
                    logger.debug(f"Image file {image_name} found with id: {dsa_uuid}")
                    return dsa_uuid
    logger.warning(f"Image file {image_name} not found")
    return None
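
For example, with placeholder names:

item_uuid = get_item_uuid(gc, "123.svs", "my-collection")
if item_uuid is None:
    print("No matching item")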

get_item_uuid_by_folder(gc, image_name, folder_uuid)

Returns the DSA item uuid from the provided folder

Parameters:

    gc: girder client. Required.
    image_name (string): name of the image in DSA e.g. 123.svs. Required.
    folder_uuid (string): uuid of parent DSA folder. Required.

Returns:

    Optional[str]: DSA item uuid, or None if nothing matches the folder uuid / image name.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_item_uuid_by_folder(gc, image_name: str, folder_uuid: str) -> Optional[str]:
    """Returns the DSA item uuid from the provided folder

    Args:
        gc: girder client
        image_name (string): name of the image in DSA e.g. 123.svs
        folder_uuid (string): uuid of parent DSA folder

    Returns:
        string: DSA item uuid. None if nothing matches the folder uuid / image name.
    """
    image_id = Path(image_name).stem
    try:
        uuid_response = gc.get(f'/item?text="{image_id}"')

    except requests.exceptions.HTTPError as err:
        logger.error(
            f"Error in item get request: {err.response.status_code}, {err.response.text}"
        )
        return None

    if uuid_response is not None and len(uuid_response) > 0:
        # multiple entries can come up based on substring matches, return the correct item id by checking name field in dictionary.
        for uuid_response_dict in uuid_response:
            if "name" in uuid_response_dict and "_id" in uuid_response_dict:
                if (
                    uuid_response_dict["name"] == image_name
                    and uuid_response_dict["folderId"] == folder_uuid
                ):
                    dsa_uuid = uuid_response_dict["_id"]
                    logger.debug(f"Image file {image_name} found with id: {dsa_uuid}")
                    return dsa_uuid
    logger.warning(f"Image file {image_name} not found")
    return None

get_slide_annotation(slide_id, annotation_name, collection_name, gc)

A helper function that pulls json annotations along with metadata for a particular slide from DSA. Used for both point and regional annotation types.

Parameters:

    slide_id (str): image name of WSI on DSA. Required.
    annotation_name (str): name of annotation, or label, created on DSA. Required.
    collection_name (str): name of DSA collection the WSI belongs to. Required.
    gc: girder client. Required.

Returns:

    Optional[Tuple[str, Dict[str, any], Dict[str, any]]]: a tuple consisting of the slide id,
        slide metadata, and a JSON-formatted annotation, or None if the annotation can't be
        found (i.e. if slide_id, annotation_name, or collection_name are mis-specified)

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_slide_annotation(
    slide_id: str,
    annotation_name: str,
    collection_name: str,
    gc,
) -> Optional[Tuple[str, Dict[str, any], Dict[str, any]]]:
    """A helper function that pulls json annotations along with
    metadata for a particular slide from DSA. Used for both point and regional
    annotation types.

    Args:
        slide_id (str): image name of WSI on DSA.
        annotation_name (str): name of annotation, or label, created on DSA
        collection_name (str): name of DSA collection the WSI belongs to
        gc: girder client

    Returns:
        Optional[Tuple[str, Dict[str, any], Dict[str, any]]]: a tuple consisting of the slide id,
            slide metadata, and a JSON-formatted annotation, or None if the annotation can't be
            found (i.e. if slide_id, annotation_name or collection_name are mis-specified)
    """

    item_uuid = get_item_uuid(gc, slide_id, collection_name)

    if not item_uuid:
        logger.info(f"Slide {slide_id} not found in {collection_name}")
        return None

    # search for annotation

    logger.debug("Starting request for annotation")
    try:
        annotation_response = gc.get(
            f"/annotation?itemId={item_uuid}&name={annotation_name}"
        )

    except Exception as err:
        logger.error(f"Error in annotation get request: {err}")
        return None

    # get annotation json from response
    if annotation_response:
        annotation_response = annotation_response[0]
        annotation = annotation_response["annotation"]
    else:
        logger.info(f"No annotation found for slide {slide_id}")
        return None

    # get additional slide-level metadata from response
    date_created = annotation_response["created"]
    date_updated = annotation_response["updated"]

    annotation_id = annotation_response["_id"]
    creator_id = annotation_response["creatorId"]
    creator_updated_id = annotation_response["updatedId"]
    annotation_name = annotation["name"]

    try:
        creator_response = gc.get(f"/user/{creator_id}")
        creator_updated_response = gc.get(f"/user/{creator_updated_id}")
    except requests.exceptions.HTTPError as err:
        logger.error(
            f"Error in user get request: {err.response.status_code}, {err.response.text}"
        )
        return None

    creator_login = creator_response["login"]
    creator_login_updated = creator_updated_response["login"]

    slide_metadata = {
        "annotation_id": annotation_id,
        "annotation_name": annotation_name,
        "date": date_created,
        "date_updated": date_updated,
        "user": creator_login,
        "user_updated": creator_login_updated,
    }

    return (slide_id, slide_metadata, json.dumps(annotation))
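
A usage sketch with placeholder names, unpacking the tuple described above:

result = get_slide_annotation("123.svs", "Tumor", "my-collection", gc)
if result is not None:
    slide_id, slide_metadata, annotation_json = result
    print(slide_metadata["user"], slide_metadata["date"])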

get_slide_df(gc, collection_uuid)

Return slide metadata (largeImage items) for a given collection as a dataframe

Parameters:

    gc: girder client. Required.
    collection_uuid (str): DSA collection uuid. Required.

Returns:

    pd.DataFrame: slide metadata, with slide_id and slide_item_uuid as additional indices

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def get_slide_df(gc, collection_uuid: str) -> pd.DataFrame:
    """Return slide metadata (largeImage items) for a given colleciton as a dataframe

    Args:
        gc: girder client
        collection_uuid (str): DSA collection uuid
    Returns:
        pd.DataFrame: slide metadata, with slide_id and slide_item_uuid as additional indicies
    """

    try:
        resource_response = gc.listResource(
            f"resource/{collection_uuid}/items", {"type": "collection"}
        )
    except Exception:
        logger.error(
            f"Couldn't retrieve resource data from DSA for {collection_uuid}, perhaps the collection UUID does not exist?"
        )
        raise RuntimeError("Retriving slide data from DSA failed.")

    df_slide_items = pd.DataFrame(resource_response).dropna(
        subset=["largeImage"]
    )  # Get largeImage types from collection items

    # Fill additional metadata based on convention (slide_id)
    df_slide_items["slide_id"] = df_slide_items["name"].apply(
        lambda x: Path(x).stem
    )  # The stem
    df_slide_items["slide_item_uuid"] = df_slide_items["_id"]

    logger.info(f"Found {len(df_slide_items)} slides!")

    return df_slide_items

import_assetstore_to_folder(gc, assetstore_uuid, destination_uuid)

Imports the assetstore to the specified destination folder.

Parameters:

    gc: girder client. Required.
    assetstore_uuid (string): uuid of the assetstore. Required.
    destination_uuid (string): uuid of the destination folder. Required.

Returns:

    None. Raises RuntimeError if the post request fails.

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def import_assetstore_to_folder(
    gc, assetstore_uuid: str, destination_uuid: str
) -> Optional[str]:
    """
    Imports the assetstore to the specified destination folder.

    Args:
        gc: girder client
        assetstore_uuid (string): uuid of the assetstore
        destination_uuid (string): uuid of the destination folder

    Returns:
        None, raises error if post request fails
    """
    request_url = f"assetstore/{assetstore_uuid}/import"
    params = {
        "destinationId": destination_uuid,
        "destinationType": "folder",
        "importPath": "/",
    }
    try:
        gc.post(request_url, parameters=params)
        logger.debug(
            f"Importing from assetstore id {assetstore_uuid}"
            + f"to destination id {destination_uuid}"
        )
    except Exception as err:
        logger.error(f"Couldn't import assetstore id {assetstore_uuid} : {err}")
        raise RuntimeError("Unable to import assetstore to collection")

push_annotation_to_dsa_image(item_uuid, annotation_file_urlpath, uri, gc, storage_options={})

Pushes annotation to DSA, adding given item_uuid (slide-specific id)

Parameters:

    item_uuid (str): DSA item uuid to be tied to the annotation. Required.
    annotation_file_urlpath (str): URL/path to the annotation JSON file, in DSA-compatible
        format. Required.
    uri (str): DSA scheme://host:port e.g. http://localhost:8080. Required.
    gc (GirderClient): girder client. Required.
    storage_options (dict): storage options to pass to the filesystem. Defaults to {}.

Returns:

    str: annotation uuid once the annotation is processed, or None if it is still processing

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def push_annotation_to_dsa_image(
    item_uuid: str,
    annotation_file_urlpath: str,
    uri: str,
    gc: girder_client.GirderClient,
    storage_options: dict = {},
):
    """Pushes annotation to DSA, adding given item_uuid (slide-specific id)

    Args:
        item_uuid (str): DSA item uuid to be tied to the annotation
        annotation_file_urlpath (str): URL/path to the annotation JSON file, in DSA-compatible format
        uri (str): DSA scheme://host:port e.g. http://localhost:8080
        gc: girder client
        storage_options (dict): storage options to pass to the filesystem

    Returns:
        str: annotation uuid once the annotation is processed, or None if it is still processing
    """

    annotation_name = Path(annotation_file_urlpath).name

    start = time.time()

    # always post a new annotation.
    # updating or deleting an existing annotation for a large annotation
    # document results in timeout.
    try:
        fs, path = fsspec.core.url_to_fs(annotation_file_urlpath, **storage_options)
        size = fs.size(path)
        reference = {
            "identifier": f"{Path(path).stem}-AnnotationFile",
            "itemId": item_uuid,
        }
        with fs.open(path) as of:
            gc.uploadFile(
                item_uuid,
                of,
                annotation_name,
                size,
                reference=orjson.dumps(reference).decode(),
            )

    except requests.exceptions.HTTPError as err:
        raise RuntimeError(
            f"Error in annotation upload: {err.response.status_code}, "
            + err.response.text
        )

    # Wait for annotation to be processed
    annotation_id = check_annotation_exists_with_retry(
        gc, item_uuid, annotation_name, retry_count=100, delay=20
    )
    if annotation_id:
        logger.info(f"Annotation successfully pushed to DSA as {annotation_id}.")
    else:
        logger.info("Annotation pushed to DSA but still processing.")
    logger.info(f"Time to push annotation {time.time() - start}")
    logger.info(f"{uri}/histomics#?image={item_uuid}")
    return annotation_id

system_check(gc)

Check DSA connection with the girder client

Parameters:

    gc: girder client. Required.

Returns:

    int: 0 for successful connection, 1 otherwise

Source code in src/luna/pathology/dsa/dsa_api_handler.py
def system_check(gc):
    """Check DSA connection with the girder client

    Args:
        gc: girder client
    Returns:
        int: 0 for successful connection, 1 otherwise
    """

    try:
        _ = gc.get("/system/check")

    except requests.exceptions.HTTPError as err:
        logger.error("Please check your host or credentials")
        logger.error(err)
        return 1

    logger.info("Successfully connected to DSA")

    return 0

utils

get_color(name, line_colors={}, fill_colors={}, alpha=100)

Get colors for cells/regions based on discrete categories.

Parameters:

    name (string): feature name e.g. Stroma, Tumor. Required.
    line_colors (dict, optional): line color map with {feature name: rgb values}. Defaults to {}.
    fill_colors (dict, optional): fill color map with {feature name: rgba values}. Defaults to {}.
    alpha (int, optional): alpha value for the fill color. Defaults to 100.

Returns:

    Tuple[string, string]: RGB line color and RGBA fill color

Source code in src/luna/pathology/dsa/utils.py
def get_color(name, line_colors={}, fill_colors={}, alpha=100):
    """Get colors for cells/regions based on discrete categories.

    Args:
        name (string): feature name e.g. Stroma, Tumor
        line_colors (dict, optional): line color map with {feature name:rgb values}
        fill_colors (dict, optional): fill color map with {feature name:rgba values}
        alpha (int, optional): alpha value for the fill color. 100 by default

    Returns:
        Tuple[string, string]: RGB line color and RGBA fill color
    """
    if name not in line_colors and name not in fill_colors:
        r = randint(0, 255)
        g = randint(0, 255)
        b = randint(0, 255)
        fill_colors[name] = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
        line_colors[name] = "rgb({}, {}, {})".format(r, g, b)
    return line_colors[name], fill_colors[name]
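
Because the default line_colors/fill_colors dicts are shared across calls, a randomly assigned color is reused on subsequent calls with the same name:

line, fill = get_color("Tumor")    # assigns and caches a random color for 'Tumor'
line2, fill2 = get_color("Tumor")  # returns the same cached color
assert (line, fill) == (line2, fill2)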

get_continuous_color(value, outline_color='same_as_fill', alpha=100)

Get RGBA line and fill colors for value.

Use color palette viridis to set a fill value - the color ranges from purple to yellow, for the values from 0 to 1. This function is used in generating a heatmap.

Parameters:

    value (float): continuous value in [0,1]. Required.
    outline_color (string, optional): manages the color used to outline the border of the
        annotation; by default, uses the same color as fill_color. Defaults to 'same_as_fill'.
    alpha (int, optional): alpha value for the fill color. Defaults to 100.

Returns:

    Tuple[str, str]: RGB line color and RGBA fill color

Source code in src/luna/pathology/dsa/utils.py
def get_continuous_color(
    value, outline_color="same_as_fill", alpha=100
) -> Tuple[str, str]:
    """Get RGBA line and fill colors for value.

    Use color palette `viridis` to set a fill value - the color ranges from purple to yellow,
     for the values from 0 to 1. This function is used in generating a heatmap.

    Args:
        value (float): continuous value in [0,1]
        outline_color (string, optional): manages the color used to outline the border of the annotation.
            by default, uses the same color as fill_color.
        alpha (int, optional): alpha value for the fill color. 100 by default

    Returns:
        Tuple[str, str]: RGB line color and RGBA fill color
    """
    c = sns.color_palette("viridis", as_cmap=True)
    r, g, b, a = c(value, bytes=True)

    fill_color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
    if outline_color == "same_as_fill":
        line_color = "rgb({}, {}, {})".format(r, g, b)
    elif outline_color == "black":
        line_color = "rgb({}, {}, {})".format(0, 0, 0)
    elif outline_color == "white":
        line_color = "rgb({}, {}, {})".format(255, 255, 255)
    else:
        return None, None
    return line_color, fill_color

vectorize_np_array_bitmask_by_pixel_value(bitmask_np, label_num=255, polygon_tolerance=1, contour_level=0.5, scale_factor=1)

Get simplified contours from the bitmask

Parameters:

    bitmask_np (np.array): a numpy bitmask. Required.
    label_num (int, optional): numeric value to filter the numpy array. Defaults to 255.
    polygon_tolerance (float, optional): maximum distance from original points of the
        polygon to the approximated polygonal chain. If tolerance is 0, the original
        coordinate array is returned. Defaults to 1.
    contour_level (float, optional): value along which to find contours in the array.
        Defaults to 0.5.
    scale_factor (int, optional): scale to match the image. Defaults to 1.

Returns:

    list: simplified approximated contours

Source code in src/luna/pathology/dsa/utils.py
def vectorize_np_array_bitmask_by_pixel_value(
    bitmask_np, label_num=255, polygon_tolerance=1, contour_level=0.5, scale_factor=1
):
    """Get simplified contours from the bitmask

    Args:
        bitmask_np (np.array): a numpy bitmask
        label_num (int, optional): numeric value to filter the numpy array
        polygon_tolerance (float, optional): Maximum distance from original points of polygon
            to approximated polygonal chain. If tolerance is 0, the original coordinate array is returned.
        contour_level (float, optional): Value along which to find contours in the array.
            0.5 by default
        scale_factor (int, optional): scale to match image. default 1

    Returns:
        list: simplified approximated contours
    """
    if not scale_factor:
        scale_factor = 1
    mask = np.where(bitmask_np == label_num, 1, 0).astype(np.int8)
    contours = measure.find_contours(mask, level=contour_level)
    simplified_contours = [
        measure.approximate_polygon(c, tolerance=polygon_tolerance) for c in contours
    ]
    for _, contour in enumerate(simplified_contours):
        for coord in contour:
            x = int(round(coord[0]))
            y = int(round(coord[1]))
            # switch coordinates, otherwise gets flipped
            coord[0] = y * scale_factor
            coord[1] = x * scale_factor

    return simplified_contours
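
A small end-to-end sketch:

import numpy as np

bitmask = np.zeros((100, 100), dtype=np.uint8)
bitmask[20:60, 30:70] = 255  # one filled rectangle labeled 255

contours = vectorize_np_array_bitmask_by_pixel_value(bitmask, scale_factor=4)
# each contour is an array of points with coordinates swapped to (x, y) and
# multiplied by scale_factor to map back to full-resolution space
print(len(contours), contours[0].shape)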

slideviewer

regional_annotation

dask_generate

cli(data_config_file, app_config_file)

This module generates parquets with regional annotation pathology data

INPUT PARAMETERS

app_config_file - path to yaml file containing application runtime parameters. See config.yaml.template

data_config_file - path to yaml file containing data input and output parameters. See dask_data_config.yaml.template

TABLE SCHEMA

  • sv_project_id: project number in slide viewer

  • slideviewer_path: slide path based on slideviewer organization

  • slide_id: slide id. synonymous with image_id

  • user: username of the annotator for a given annotation. Multiple annotations from different users are combined per slide; in that combined case, user is set to 'CONCAT' and bmp_filepath and npy_filepath are null.

  • bmp_filepath: file path to downloaded bmp annotation file

  • npy_filepath: file path to npy annotation file converted from bmp

  • geojson_path: file path to geojson file converted from numpy

  • date: creation date

  • labelset:

Source code in src/luna/pathology/slideviewer/regional_annotation/dask_generate.py
@click.command()
@click.option(
    "-d",
    "--data_config_file",
    default=None,
    type=click.Path(exists=True),
    help="path to yaml file containing data input and output parameters. "
    "See dask_data_config.yaml.template",
)
@click.option(
    "-a",
    "--app_config_file",
    default="config.yaml",
    type=click.Path(exists=True),
    help="path to yaml file containing application runtime parameters. "
    "See config.yaml.template",
)
def cli(data_config_file, app_config_file):
    """This module generates parquets with regional annotation pathology data

    INPUT PARAMETERS

    app_config_file - path to yaml file containing application runtime parameters. See config.yaml.template

    data_config_file - path to yaml file containing data input and output parameters. See dask_data_config.yaml.template

    TABLE SCHEMA

    - sv_project_id: project number in slide viewer

    - slideviewer_path: slide path based on slideviewer organization

    - slide_id: slide id. synonymous with image_id

    - user: username of the annotator for a given annotation. Multiple annotations from
        different users are combined per slide; in that combined case, user is set to 'CONCAT'
        and bmp_filepath and npy_filepath are null.

    - bmp_filepath: file path to downloaded bmp annotation file

    - npy_filepath: file path to npy annotation file converted from bmp

    - geojson_path: file path to geojson file converted from numpy

    - date: creation date

    - labelset:
    """
    logger = init_logger()

    # load configs
    cfg = ConfigSet(name="DATA_CFG", config_file=data_config_file)
    cfg = ConfigSet(name="APP_CFG", config_file=app_config_file)

    with CodeTimer(logger, "generate annotation geojson table"):
        logger.info("data template: " + data_config_file)
        logger.info("config_file: " + app_config_file)

        # copy app and data configuration to destination config dir
        config_location = const.CONFIG_LOCATION(cfg)
        os.makedirs(config_location, exist_ok=True)

        shutil.copy(app_config_file, os.path.join(config_location, "app_config.yaml"))
        shutil.copy(data_config_file, os.path.join(config_location, "data_config.yaml"))
        logger.info("config files copied to %s", config_location)

        failed = create_geojson_table()

        if failed:
            logger.error("GEOJSON table creation had errors. Exiting.")
            logger.error(failed)
            raise RuntimeError("GEOJSON table creation had errors. Exiting.")

        return
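
Assuming the module exposes this click command as its entry point (the invocation path is an assumption), a run might look like:

# Hypothetical invocation; the -d/-a flags are those defined above
python -m luna.pathology.slideviewer.regional_annotation.dask_generate \
    -d dask_data_config.yaml \
    -a config.yaml
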
create_geojson_table()

Vectorizes npy array annotation file into polygons and builds GeoJson with the polygon features. Creates a geojson file per labelset. Combines multiple annotations from different users for a slide.

Returns:

Name Type Description
list

list of slide ids that failed

Source code in src/luna/pathology/slideviewer/regional_annotation/dask_generate.py
def create_geojson_table():
    """Vectorizes npy array annotation file into polygons and builds GeoJson with the polygon features.
    Creates a geojson file per labelset.
    Combines multiple annotations from different users for a slide.

    Returns:
        list: list of slide ids that failed
    """
    logger = logging.getLogger(__name__)

    failed = []
    # get application and data config variables
    cfg = ConfigSet()
    client = Client(n_workers=25, threads_per_worker=1, memory_limit=0.1)
    client.run(init_logger)
    logger.info(client)

    SLIDEVIEWER_API_URL = cfg.get_value("DATA_CFG::SLIDEVIEWER_API_URL")
    SLIDEVIEWER_CSV_FILE = cfg.get_value("DATA_CFG::SLIDEVIEWER_CSV_FILE")
    PROJECT_ID = cfg.get_value("DATA_CFG::PROJECT_ID")
    LANDING_PATH = cfg.get_value("DATA_CFG::LANDING_PATH")
    TMP_ZIP_DIR_NAME = cfg.get_value("DATA_CFG::REQUESTOR_DEPARTMENT") + "_tmp_zips"
    TMP_ZIP_DIR = os.path.join(LANDING_PATH, TMP_ZIP_DIR_NAME)
    SLIDE_BMP_DIR = os.path.join(LANDING_PATH, "regional_bmps")
    SLIDE_NPY_DIR = os.path.join(LANDING_PATH, "regional_npys")
    SLIDE_STORE_DIR = os.path.join(LANDING_PATH, "slides")
    TABLE_OUT_DIR = const.TABLE_LOCATION(cfg)

    os.makedirs(TABLE_OUT_DIR, exist_ok=True)
    logger.info("Table output directory = %s", TABLE_OUT_DIR)

    # setup variables needed for build geojson UDF
    contour_level = cfg.get_value("DATA_CFG::CONTOUR_LEVEL")

    # fetch full set of slideviewer slides for project
    slides = fetch_slide_ids(
        SLIDEVIEWER_API_URL,
        PROJECT_ID,
        const.CONFIG_LOCATION(cfg),
        SLIDEVIEWER_CSV_FILE,
    )
    df = pd.DataFrame(
        data=np.array(slides), columns=["slideviewer_path", "slide_id", "sv_project_id"]
    )

    # get users and labelsets for df explosion
    all_users_list = cfg.get_value("DATA_CFG::USERS")
    all_labelsets = cfg.get_value("DATA_CFG::LABEL_SETS")

    global params
    params = cfg.get_config_set("APP_CFG")

    bmp_jobs = []
    for _, row in df.iterrows():
        bmp_future = client.submit(
            check_slideviewer_and_download_bmp,
            row.sv_project_id,
            row.slideviewer_path,
            row.slide_id,
            all_users_list,
            SLIDE_BMP_DIR,
            SLIDEVIEWER_API_URL,
            TMP_ZIP_DIR,
        )
        bmp_jobs.append(bmp_future)

    json_jobs = []
    for bmp_future in as_completed(bmp_jobs):
        if bmp_future.result() is not None:
            json_future = client.submit(
                convert_slide_bitmap_to_geojson,
                bmp_future,
                all_labelsets,
                contour_level,
                SLIDE_NPY_DIR,
                SLIDE_STORE_DIR,
            )
            json_jobs.append(json_future)

    for json_future in as_completed(json_jobs):
        slide_id = -1
        try:
            if json_future.result() is not None:
                slide_id, data = json_future.result()
                if slide_id and data:
                    result_df = pd.DataFrame(data)
                    logger.info(result_df)
                    result_df.drop(columns="geojson").to_parquet(
                        f"{TABLE_OUT_DIR}/regional_annot_slice_slide={slide_id}.parquet"
                    )
                else:
                    failed.append(slide_id)
                    logger.warning(
                        "Empty geojson returned. This means either the slide was empty or an error occurred during geojson generation."
                    )
        except Exception:
            failed.append(slide_id)
            logger.warning(f"Something was wrong with future {json_future}, skipping.")

    client.shutdown()

    return failed
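A minimal invocation sketch, assuming the application and data configurations have already been registered with ConfigSet (as the CLI wrapper above does before calling this function); the error handling shown is illustrative:

from luna.pathology.slideviewer.regional_annotation.dask_generate import (
    create_geojson_table,
)

# Assumes APP_CFG and DATA_CFG were loaded into ConfigSet beforehand
failed_slides = create_geojson_table()
if failed_slides:
    print(f"{len(failed_slides)} slide(s) failed: {failed_slides}")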

spatial

stats

Kfunction(p1XY, p2XY, radius, ls=False, count=True, intensity=[], distance=False, distance_scale=10.0)

Computes the Counting, Intensity, and experimental Intensity-Distance K functions

Parameters:

Name Type Description Default
p1XY ndarray

An Nx2 array representing the (X,Y) coordinates of cells with phenotype 1

required
p2XY ndarray

Same as p1XY but for phenotype 2 cells

required
radius (float, list[float])

The radius (or list of radii) to consider

required
ls bool

If True, returns a |radius| x |p1XY| 2D array representing the K function for each phenotype 1 cell at each radius. If False, returns the mean for each radius

False
count bool

By default, this function only computes the Counting K function. Can be disabled with count=False.

True
intensity ndarray

An array of length |p2XY| representing the intensity of each phenotype 2 cell. When passed in, this method will also compute the Intensity K function

[]
distance bool

If an intensity array is passed in, then setting distance=True will compute the experimental Intensity-Distance K function

False
distance_scale float

Characteristic distance scale (usually approx. 1 cell length in the given units)

10.0

Returns:

Type Description
dict

a dictionary with keys ["count", "intensity", "distance"] and values corresponding to the result of each K function
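Reading directly from the implementation below, the three variants for each phenotype 1 cell i reduce to the following, where d_{ij} is the distance from cell i to phenotype 2 cell j, I_j is the intensity of cell j, and s is distance_scale:

K_{\mathrm{count}}(i, r) = \sum_j \mathbf{1}[d_{ij} \le r]

K_{\mathrm{intensity}}(i, r) = \sum_j \mathbf{1}[d_{ij} \le r] \, I_j

K_{\mathrm{distance}}(i, r) = \sum_j \mathbf{1}[d_{ij} \le r] \, \frac{I_j}{s + (d_{ij}/s)^3}

With ls=True the per-cell values are returned for each radius; with ls=False each variant is averaged over the phenotype 1 cells per radius.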
Source code in src/luna/pathology/spatial/stats.py
def Kfunction(
    p1XY,
    p2XY,
    radius,
    ls=False,
    count=True,
    intensity=[],
    distance=False,
    distance_scale=10.0,
):
    """Computes the Counting, Intensity, and experimental
                Intensity-Distance K functions

    Args:
            p1XY (np.ndarray): An Nx2 array representing the (X,Y) coordinates of cells with phenotype 1
            p2XY (np.ndarray): Same as p1XY but for phenotype 2 cells
            radius (float, list[float]): The radius (or list of radii) to consider
            ls (bool): If True, returns an |radius|x|p1XY| 2D array representing the K function
                for each phenotype 1 cell for each radius. If False, returns the mean
                for each radius
            count (bool): By default, this function only computes the Counting K function.
                   Can be disabled with count=False.
            intensity (np.ndarray): An array of length |p2XY| representing the intensity of each
                       phenotype 2 cell. When passed in, this method will also compute
                       the Intensity K function
        distance (bool): If an intensity array is passed in, then setting distance=True
                  will compute the experimental Intensity-Distance K function
                distance_scale (float): Characteristic distance scale (usually approx. 1 cell length in the given units)

        Returns:
                dict: a dictionary with keys ["count", "intensity", "distance"] and values corresponding to the result of each K function
    """
    # Compute the distance matrix
    dists = cdist(p1XY, p2XY)

    # Turn radius into an array if it isn't one already
    try:
        iter(radius)
    except TypeError:
        radius = [radius]

    # Define the lambdas for each K function variant
    CKfunc = lambda mask: np.sum(mask, axis=1)
    IKfunc = lambda Imask: np.sum(Imask, axis=1)
    IDKfunc = lambda Imask: np.sum(
        Imask * (1 / (distance_scale + (dists / distance_scale) ** 3)), axis=1
    )

    # Compute the mask for each radius
    masks = [(dists <= r) for r in radius]

    # Calculate each K function
    Kdict = {}
    if count:
        CK = [CKfunc(mask) for mask in masks]
        Kdict["count"] = _ret(CK, ls)
    if len(intensity) > 0:
        assert len(intensity) == len(p2XY)
        Imasks = [mask * intensity for mask in masks]
        IK = [IKfunc(Imask) for Imask in Imasks]
        Kdict["intensity"] = _ret(IK, ls)
        if distance:
            IDK = [IDKfunc(Imask) for Imask in Imasks]
            Kdict["distance"] = _ret(IDK, ls)

    return Kdict
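A minimal usage sketch with synthetic data (the coordinates, intensities, and radii below are illustrative, not from the library):

import numpy as np
from luna.pathology.spatial.stats import Kfunction

rng = np.random.default_rng(42)
p1XY = rng.uniform(0, 100, size=(50, 2))       # phenotype 1 centroids
p2XY = rng.uniform(0, 100, size=(80, 2))       # phenotype 2 centroids
intensity = rng.uniform(0, 1, size=len(p2XY))  # per-cell intensity for phenotype 2

result = Kfunction(
    p1XY,
    p2XY,
    radius=[10.0, 25.0],  # one estimate per radius
    ls=False,             # per-radius means rather than per-cell arrays
    intensity=intensity,  # enables the Intensity K function
    distance=True,        # enables the Intensity-Distance variant
)
print(result["count"], result["intensity"], result["distance"])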

transforms

Higher-level transformation functions

generate_k_function_statistics(cell_paths, method_data, main_index=None)

Compute K-function spatial statistics on given cell-data

Parameters:

Name Type Description Default
cell_paths str or list[str]

paths to a single or multiple FOV regions

required
method_data dict

Configuration: "index": (str, optional) Column containting the patient/desired ID, if available (overrides main_index) "phenotype1" : { "name" : (str) Column name to query 'value' : (str) Phenotype string to match (e.g. CD68) }, "phenotype2" : { "name" : (str) Column name to query 'value' : (str) Phenotype string to match (e.g. panCK) }, "count" : (bool) Flag to compute counting stats. "radius" : (float) Radius cutoff "intensity" : (str, optional) Column containing intensity information "distance" : (bool) Flag to compute intensity-distance stats.

required

Returns:

Type Description

pd.DataFrame: spatial statistics aggregated over FOVs

Source code in src/luna/pathology/spatial/transforms.py
def generate_k_function_statistics(cell_paths, method_data, main_index=None):
    """
    Compute K-function spatial statistics on given cell-data

    Args:
        cell_paths (str or list[str]): paths to a single or multiple FOV regions
        method_data (dict): Configuration:
                "index": (str, optional) Column containting the patient/desired ID, if available (overrides main_index)
                "phenotype1" : {
                        "name" : (str) Column name to query
                        'value' : (str) Phenotype string to match (e.g. CD68)
                },
                "phenotype2" : {
                        "name" : (str) Column name to query
                        'value' : (str) Phenotype string to match (e.g. panCK)
                },
                "count" : (bool) Flag to compute counting stats.
                "radius" : (float) Radius cutoff
                "intensity" : (str, optional) Column containing intensity information
                "distance" : (bool) Flag to compute intensity-distance stats.

    Returns:
        pd.DataFrame: spatial statistics aggregated over FOVs
    """

    if isinstance(cell_paths, str):
        cell_paths = [cell_paths]

    print(cell_paths)

    agg_k_data = {}

    pheno1_col = method_data["phenotype1"]["name"]
    pheno1_val = method_data["phenotype1"]["value"]
    pheno2_col = method_data["phenotype2"]["name"]
    pheno2_val = method_data["phenotype2"]["value"]
    index_col = method_data.get("index", None)
    radius = method_data["radius"]
    count = method_data["count"]
    distance = method_data["distance"]
    intensity_col = method_data.get("intensity", None)

    indices = set()

    for cell_path in cell_paths:

        if Path(cell_path).suffix == ".parquet":
            df = pd.read_parquet(cell_path)
        elif Path(cell_path).suffix == ".csv":
            df = pd.read_csv(cell_path)
        else:
            raise RuntimeError(f"Invalid input data type {cell_path}")

        # Look up the index for this slice; default to None so the empty-phenotype
        # warnings below still format when no index column is configured
        index = None
        if index_col:
            index = df[index_col].iloc[0]
            indices.add(index)

        # Create the data arrays
        pheno1 = df[df[pheno1_col] == pheno1_val]
        pheno2 = df[df[pheno2_col] == pheno2_val]
        p1XY = np.array(pheno1[["Centroid X µm", "Centroid Y µm"]])
        p2XY = np.array(pheno2[["Centroid X µm", "Centroid Y µm"]])

        if intensity_col:
            intensity = np.array(pheno2[intensity_col])
        else:
            intensity = []
            if distance:
                raise RuntimeError(
                    "Can't compute intensity-distance function without intensity information"
                )

        if p1XY.size == 0:
            print(
                f"WARNING: List of phenotype 1 cells ({pheno1_val}) is empty for {index}"
            )
        if p2XY.size == 0:
            print(
                f"WARNING: List of phenotype 2 cells ({pheno2_val}) is empty for {index}"
            )

        # Compute the K function
        print(f"Running... {cell_path}")

        fov_k_data = Kfunction(
            p1XY,
            p2XY,
            radius,
            ls=True,
            count=count,
            intensity=intensity,
            distance=distance,
        )

        for key in fov_k_data:
            if key in agg_k_data:
                # np.append returns a new array rather than modifying in place,
                # so reassign for the per-FOV results to actually accumulate
                agg_k_data[key] = np.append(agg_k_data[key], fov_k_data[key])
            else:
                agg_k_data[key] = fov_k_data[key]

    data_out = {}

    for kfunct in agg_k_data.keys():
        arr = agg_k_data[kfunct]
        if len(arr) == 0:
            arr = [0]
        data_out.update(
            {
                f"For_{pheno1_val}_Find_{pheno2_val}_at{radius}_{kfunct}_{intensity_col}_mean": np.mean(
                    arr
                ),
                f"For_{pheno1_val}_Find_{pheno2_val}_at{radius}_{kfunct}_{intensity_col}_variance": np.var(
                    arr
                ),
                f"For_{pheno1_val}_Find_{pheno2_val}_at{radius}_{kfunct}_{intensity_col}_skew": scipy.stats.skew(
                    arr
                ),
                f"For_{pheno1_val}_Find_{pheno2_val}_at{radius}_{kfunct}_{intensity_col}_kurtosis": scipy.stats.kurtosis(
                    arr
                ),
            }
        )

    df_slice_out = pd.DataFrame(data_out, index=[0]).astype(np.float64)

    if main_index is None:
        if not len(indices) == 1:
            raise RuntimeError(
                f"Multiple cell maps with different indices! Found: {indices}"
            )
        main_index = indices.pop()

    df_slice_out["main_index"] = main_index
    df_slice_out = df_slice_out.set_index("main_index")

    print(df_slice_out)

    return df_slice_out
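A usage sketch; the file path and the "Phenotype" / "panCK_intensity" column names are hypothetical placeholders for this example, while the centroid columns "Centroid X µm" / "Centroid Y µm" are required by the implementation above:

from luna.pathology.spatial.transforms import generate_k_function_statistics

method_data = {
    "phenotype1": {"name": "Phenotype", "value": "CD68"},   # hypothetical column
    "phenotype2": {"name": "Phenotype", "value": "panCK"},
    "radius": 25.0,                  # radius cutoff, in the table's units
    "count": True,                   # compute counting statistics
    "intensity": "panCK_intensity",  # hypothetical column; optional
    "distance": False,               # requires an intensity column if True
}

df_stats = generate_k_function_statistics(
    "fov_cells.csv",           # hypothetical path; .csv or .parquet accepted
    method_data,
    main_index="patient_001",  # or provide an "index" column via method_data
)

The output is a single-row DataFrame whose columns follow the pattern For_{phenotype1}_Find_{phenotype2}_at{radius}_{kfunction}_{intensity}_{statistic}, covering the mean, variance, skew, and kurtosis of each computed K function.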