slide_etl

Ingest slides by adding them to a file- or S3-based storage location and generating metadata about them.

Parameters:

Name	Type	Description	Default
`slide_urlpath`	`str`	path to slide image	`'???'`
`project_name`	`str`	project name under which the slides should reside	`''`
`comment`	`str`	comment and description of dataset	`''`
`subset_csv_urlpath`	`str`	url/path to subset csv	`''`
`debug_limit`	`int`	limit number of slides	`0`
`output_urlpath`	`str`	url/path to output table	`''`
`storage_options`	`dict`	storage options to pass to reading functions	`{}`
`output_storage_options`	`dict`	storage options to pass to writing functions	`{}`
`local_config`	`str`	url/path to YAML config file	`''`
`no_copy`	`bool`	determines whether we copy slides to output_urlpath	`False`
`metadata_extension`	`str`	file extension of generated metadata file (either 'csv' or 'parquet')	`'parquet'`

Source code in src/luna/pathology/cli/slide_etl.py

@timed
def cli(
    slide_urlpath: str = "???",
    project_name: str = "",
    comment: str = "",
    subset_csv_urlpath: str = "",
    debug_limit: int = 0,
    output_urlpath: str = "",
    storage_options: dict = {},
    output_storage_options: dict = {},
    local_config: str = "",
    no_copy: bool = False,
    metadata_extension: str = "parquet",
):
    """Ingest slides by adding them to a file or s3 based storage location and generating metadata about them

    ``slide_urlpath`` may point either at a single slide image (recognised by
    its file extension) or at a directory, in which case every file directly
    inside it with a valid slide extension is ingested.

    Args:
        slide_urlpath (str): path to slide image
        project_name (str): project name under which the slides should reside
        comment (str): comment and description of dataset
        subset_csv_urlpath (str): url/path to subset csv
        debug_limit (int): limit number of slides
        output_urlpath (str): url/path to output table
        storage_options (dict): storage options to pass to reading functions
        output_storage_options (dict): storage options to pass to writing functions
        local_config (str): url/path to YAML config file
        no_copy (bool): determines whether we copy slides to output_urlpath
        metadata_extension (str): file extension of generated metadata file (either 'csv' or 'parquet')

    Returns:
        None. When ``output_urlpath`` is set, the metadata table is written to
        ``slide_ingest_<project_name>.<extension>`` under that location.

    Raises:
        ValueError: if ``output_urlpath`` is set and ``metadata_extension`` is
            neither 'csv' nor 'parquet'.
    """
    # NOTE: this must remain the first statement. vars() snapshots the local
    # namespace, which at this point holds exactly the call arguments.
    # NOTE(review): the dict defaults in the signature are shared between calls;
    # they are kept for interface compatibility and are not mutated in this body.
    config = get_config(vars())

    # Normalise ".CSV" / "Parquet" style values. An empty value becomes "" so the
    # name is always bound (previously this could surface as a NameError later).
    extension = (config["metadata_extension"] or "").lower().replace(".", "")
    if config["output_urlpath"] and extension not in ("csv", "parquet"):
        # Fail before any processing instead of silently writing an empty file.
        raise ValueError(
            f"Unsupported metadata_extension {config['metadata_extension']!r}: "
            "expected 'csv' or 'parquet'"
        )

    filesystem, slide_path = fsspec.core.url_to_fs(
        config["slide_urlpath"], **config["storage_options"]
    )
    slide_paths = []  # type: list[str]
    if slide_path.endswith(tuple(VALID_SLIDE_EXTENSIONS)):
        # Single slide: append the path itself. `list += str` would extend the
        # list with the individual characters of the path.
        slide_paths.append(slide_path)
    else:
        # Directory: collect every direct child with a valid slide extension.
        for ext in VALID_SLIDE_EXTENSIONS:
            slide_paths.extend(filesystem.glob(f"{slide_path}/*{ext}"))

    if config["subset_csv_urlpath"]:
        slide_paths = apply_csv_filter(
            slide_paths, config["subset_csv_urlpath"], config["storage_options"]
        )
    if config["debug_limit"] > 0:
        slide_paths = slide_paths[: config["debug_limit"]]

    configure_dask_client()

    if len(slide_paths) == 0:
        logger.warning(f"No slides found for {config['slide_urlpath']}")
        return None

    # Restore the protocol prefix that url_to_fs stripped from the paths.
    slide_urls = [filesystem.unstrip_protocol(path) for path in slide_paths]

    df = slide_etl(
        slide_urls,
        config["project_name"],
        config["comment"],
        config["storage_options"],
        config["output_urlpath"],
        config["output_storage_options"],
        config["no_copy"],
    )

    logger.info(df)
    if config["output_urlpath"]:
        output_filesystem, output_path = fsspec.core.url_to_fs(
            config["output_urlpath"], **config["output_storage_options"]
        )

        f = Path(output_path) / f"slide_ingest_{config['project_name']}.{extension}"
        with output_filesystem.open(f, "wb") as of:
            if extension == "csv":
                logger.info("Writing to csv file")
                df.to_csv(of)
            else:
                # Only "parquet" can reach here; the extension was validated above.
                logger.info("Writing to parquet file")
                df.to_parquet(of)