# Source code for datapipe_testbench.processing

#!/usr/bin/env python3

"""Helper functions for processing data with datapipe-testbench."""

import itertools
import logging
from pathlib import Path

import graphviz
from tqdm.auto import tqdm

from .benchmark import Benchmark
from .inputdataset import InputDataset
from .store import MetricsStore, ResultStore
from .utils import CATEGORY_NAME_MAP
from .visualization import graphviz_inputs_to_benchmarks

LOGGER = logging.getLogger("datapipe-testbench")


# Public API of this module. InputDataset and ResultStore are re-exported
# here for convenience so users can access them from the processing module.
__all__ = [
    "print_benchmark_info",
    "generate_all_metrics",
    "load_experiments",
    "run_comparison_study",
    "visualize_comparison_study",
    "rename_telescope_type",
    "print_telescope_type_transforms",
    "InputDataset",
    "ResultStore",
]


def rename_telescope_type(old_name: str, new_name: str) -> None:
    """
    Register a re-mapping from an old telescope-type name to a new one.

    This handles cases where the telescope-type name changed between
    simulation productions. The currently registered re-mappings can be
    listed with ``print_telescope_type_transforms()``.
    """
    CATEGORY_NAME_MAP[old_name] = new_name
def load_experiments( experiment_names: list[str], experiments_path: Path | str ) -> list[MetricsStore]: """ Get the MetricStores for a list of previously-generated experiments. Parameters ---------- experiment_names: list[str] list of names of experiments within experiments_path. These will be the name of the InputDataset used to write them if you used `generate_all_metrics()` experiements_path: Path | str Where your experiments are stored. Returns ------- list[MetricsStore]: List containing the MetricsStore for each experiment in the experiment_names list. """ experiments_path = Path(experiments_path) metric_store_list = [] for name in experiment_names: path = experiments_path / name if not path.exists(): raise FileNotFoundError(f"Couldn't find experiment: {path}") metric_store_list.append(MetricsStore(path)) return metric_store_list
def generate_all_metrics(
    input_dataset_list: list[InputDataset],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    skip_existing: bool = False,
) -> list[MetricsStore]:
    """
    Generate all metrics for a given list of inputs and benchmarks.

    This calls `datapipe_testbench.benchmark.Benchmark.generate_metrics` for
    each input dataset. The output will be stored in experiments_path, with
    subdirectories created using the name field of each `InputDataset`. The
    subdirectories will contain the generated metrics.

    Parameters
    ----------
    input_dataset_list : list[InputDataset]
        Input information for each experiment
    benchmark_list : list[Benchmark]
        Which benchmarks to generate metrics for
    experiments_path : Path | str
        Where to write out the metrics. Inside, a subdirectory for each
        inputdataset will be generated
    skip_existing: bool
        If True, don't re-generate metrics that already exist. Note that this
        does not detect changes in the Benchmark options, it only checks if
        the required outputs already exist.

    Returns
    -------
    list[MetricsStore]:
        list of generated metrics, corresponding to each InputDataset. These
        can also be loaded up at a future time.
    """
    experiments_path = Path(experiments_path).expanduser()

    # load the stores for each experiment and check if we have the required
    # data for the requested benchmarks.
    print("Processing:")
    print(f" * Benchmarks: {[x.name for x in benchmark_list]}")
    print(f" * Inputs: {[x.name for x in input_dataset_list]}")

    # check inputs for every benchmark/input combination up front, before
    # any (potentially slow) metric generation starts.
    for benchmark, input_data in itertools.product(benchmark_list, input_dataset_list):
        benchmark.check_input_dataset(input_data)

    # Initialize metric stores so we can read the data:
    metric_store_list = [
        MetricsStore.from_path_and_input_dataset(
            experiments_path / input_data.name, input_data
        )
        for input_data in input_dataset_list
    ]

    # Process the data for each combination.
    to_process = list(itertools.product(metric_store_list, benchmark_list))
    for metric_store, benchmark in tqdm(to_process, desc="generating"):
        print(f"{metric_store.name} -- {benchmark.name}")
        # Short-circuit: only probe for existing outputs when skipping is
        # requested, so missing_outputs() is not called needlessly when we
        # are going to regenerate anyway.
        if not skip_existing or benchmark.missing_outputs(metric_store):
            benchmark.generate_metrics(metric_store)
        else:
            print(" --> already exists, skipping...")

    return metric_store_list
def run_comparison_study(
    name: str,
    experiment_names: list[str],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    studies_path: Path | str,
    metadata: dict | None = None,
) -> ResultStore:
    """
    Generate output plots comparing the list of Metrics for the given
    Benchmarks.

    The outputs will be written to ``{studies_path}/{name}/*``

    Parameters
    ----------
    name: str
        Name of this study
    experiment_names: list[str]
        list of names of experiments within experiments_path. These will be
        the name of the InputDataset used to write them if you used
        `generate_all_metrics()`
    benchmark_list : list[Benchmark]
        List of Benchmarks to generate plots for.
    experiments_path: Path | str
        Base path of where your experiments are stored
    studies_path : Path | str
        Base path of where to store study results. The output will be in a
        subdirectory of this path by name.
    metadata: dict | None
        any other metadata you want to store with this study

    Returns
    -------
    ResultStore:
        the output
    """
    experiments_path = Path(experiments_path).expanduser()
    studies_path = Path(studies_path).expanduser()

    # The study gets its own subdirectory under studies_path, named after it.
    result_store = ResultStore(base_path=studies_path / name, name=name)
    if metadata:
        result_store.metadata.update(metadata)

    experiments = load_experiments(
        experiment_names=experiment_names, experiments_path=experiments_path
    )

    for benchmark in tqdm(benchmark_list, desc="benchmarking"):
        benchmark.compare_to_reference(experiments, result_store)

    # also writeout the visualization of inputs -> benchmarks as a PDF
    datasets = [metric_store.get_inputdata() for metric_store in experiments]
    overview_graph = graphviz_inputs_to_benchmarks(
        input_dataset_list=datasets, benchmark_list=benchmark_list
    )
    overview_graph.render(
        filename="study.dot", directory=result_store.base_path, format="pdf"
    )

    return result_store
def visualize_comparison_study(
    experiment_names: list[str],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    file_length: int = 40,
) -> graphviz.Digraph:
    """Show an overview of this study."""
    # Load the stores so we can recover the InputDataset behind each one.
    experiments = load_experiments(
        experiment_names=experiment_names, experiments_path=experiments_path
    )
    datasets = [metric_store.get_inputdata() for metric_store in experiments]
    overview = graphviz_inputs_to_benchmarks(
        input_dataset_list=datasets,
        benchmark_list=benchmark_list,
        file_length=file_length,
    )
    return overview