Skip to content

slide_etl

Ingest slides by adding them to a file- or S3-based storage location and generating metadata about them

Parameters:

Name Type Description Default
slide_urlpath str

path to slide image

'???'
project_name str

project name under which the slides should reside

''
comment str

comment and description of dataset

''
subset_csv_urlpath str

url/path to subset csv

''
debug_limit int

limit number of slides

0
output_urlpath str

url/path to output table

''
storage_options dict

storage options to pass to reading functions

{}
output_storage_options dict

storage options to pass to writing functions

{}
local_config str

url/path to YAML config file

''
no_copy bool

determines whether we copy slides to output_urlpath

False
metadata_extension str

file extension of generated metadata file (either 'csv' or 'parquet')

'parquet'
Source code in src/luna/pathology/cli/slide_etl.py
@timed
def cli(
    slide_urlpath: str = "???",
    project_name: str = "",
    comment: str = "",
    subset_csv_urlpath: str = "",
    debug_limit: int = 0,
    output_urlpath: str = "",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
    no_copy: bool = False,
    metadata_extension: str = "parquet",
):
    """Ingest slides by adding them to a file or s3 based storage location and generating metadata about them

    ``slide_urlpath`` may point either at a single slide file (its name ends
    with one of ``VALID_SLIDE_EXTENSIONS``) or at a directory, in which case
    every directly contained file with a valid slide extension is ingested.

    Args:
        slide_urlpath (str): path to slide image
        project_name (str): project name under which the slides should reside
        comment (str): comment and description of dataset
        subset_csv_urlpath (str): url/path to subset csv
        debug_limit (int): limit number of slides
        output_urlpath (str): url/path to output table
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): url/path to YAML config file
        no_copy (bool): determines whether we copy slides to output_urlpath
        metadata_extension (str): file extension of generated metadata file (either 'csv' or 'parquet')

    Returns:
        None. When ``output_urlpath`` is set, the metadata table is written to
        ``slide_ingest_<project_name>.<extension>`` under that location.

    Raises:
        ValueError: if an output table is requested and ``metadata_extension``
            is neither 'csv' nor 'parquet'.
    """
    # NOTE: vars() must stay the first statement so that it captures only the
    # call arguments and no later locals. The mutable ``{}`` defaults are kept
    # for signature compatibility; they are only read, never mutated, here.
    config = get_config(vars())

    # Normalise the extension up front ('.CSV' -> 'csv'). An empty value falls
    # back to the signature default instead of leaving the name unbound.
    extension = (config["metadata_extension"] or "parquet").lower().replace(".", "")
    # Fail before the (expensive) ETL runs rather than silently producing an
    # empty output file for an extension that no writer handles.
    if config["output_urlpath"] and extension not in ("csv", "parquet"):
        raise ValueError(
            f"Unsupported metadata_extension '{config['metadata_extension']}': "
            "expected 'csv' or 'parquet'"
        )

    filesystem, slide_path = fsspec.core.url_to_fs(
        config["slide_urlpath"], **config["storage_options"]
    )
    slide_paths = []  # type: list[str]
    if any(slide_path.endswith(ext) for ext in VALID_SLIDE_EXTENSIONS):
        # Single slide file: append the path itself. (`+=` with a str would
        # extend the list with the path's individual characters.)
        slide_paths.append(slide_path)
    else:
        # Directory: collect every direct child with a valid slide extension.
        for ext in VALID_SLIDE_EXTENSIONS:
            slide_paths += filesystem.glob(f"{slide_path}/*{ext}")

    if config["subset_csv_urlpath"]:
        slide_paths = apply_csv_filter(
            slide_paths, config["subset_csv_urlpath"], config["storage_options"]
        )
    if config["debug_limit"] > 0:
        slide_paths = slide_paths[: config["debug_limit"]]

    configure_dask_client()

    if not slide_paths:
        return None

    # Re-attach the protocol (e.g. "s3://") that url_to_fs stripped so that
    # downstream readers receive full URLs.
    slide_urls = [filesystem.unstrip_protocol(path) for path in slide_paths]

    df = slide_etl(
        slide_urls,
        config["project_name"],
        config["comment"],
        config["storage_options"],
        config["output_urlpath"],
        config["output_storage_options"],
        config["no_copy"],
    )

    logger.info(df)
    if config["output_urlpath"]:
        output_filesystem, output_path = fsspec.core.url_to_fs(
            config["output_urlpath"], **config["output_storage_options"]
        )

        f = Path(output_path) / f"slide_ingest_{config['project_name']}.{extension}"
        with output_filesystem.open(f, "wb") as of:
            if extension == "csv":
                logger.info("Writing to csv file")
                df.to_csv(of)
            elif extension == "parquet":
                logger.info("Writing to parquet file")
                df.to_parquet(of)