# Source code for datapipe_testbench.processing

#!/usr/bin/env python3

"""Helper functions for processing data with datapipe-testbench."""

import itertools
import logging
from pathlib import Path

import graphviz
from tqdm.auto import tqdm

from .benchmark import Benchmark
from .inputdataset import InputDataset
from .store import MetricsStore, ResultStore
from .utils import CATEGORY_NAME_MAP
from .visualization import graphviz_inputs_to_benchmarks

LOGGER = logging.getLogger("datapipe-testbench")


# Public API of this module. InputDataset and ResultStore are re-exported
# here for convenience so users can access them from the processing module.
__all__ = [
    "print_benchmark_info",
    "generate_all_metrics",
    "load_experiments",
    "run_comparison_study",
    "visualize_comparison_study",
    "rename_telescope_type",
    "print_telescope_type_transforms",
    "InputDataset",
    "ResultStore",
]


def rename_telescope_type(old_name: str, new_name: str) -> None:
    """
    Register a re-mapping from an old telescope-type name to a new one.

    This handles cases where the telescope-type name changed between
    simulation productions. The currently registered re-mappings can be
    listed with ``print_telescope_type_transforms()``.
    """
    CATEGORY_NAME_MAP[old_name] = new_name
def load_experiments( experiment_names: list[str], experiments_path: Path | str ) -> list[MetricsStore]: """ Get the MetricStores for a list of previously-generated experiments. Parameters ---------- experiment_names: list[str] list of names of experiments within experiments_path. These will be the name of the InputDataset used to write them if you used `generate_all_metrics()` experiements_path: Path | str Where your experiments are stored. Returns ------- list[MetricsStore]: List containing the MetricsStore for each experiment in the experiment_names list. """ experiments_path = Path(experiments_path) metric_store_list = [] for name in experiment_names: path = experiments_path / name if not path.exists(): raise FileNotFoundError(f"Couldn't find experiment: {path}") metric_store_list.append(MetricsStore(path)) return metric_store_list
def generate_all_metrics(
    input_dataset_list: list[InputDataset],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    skip_existing: bool = False,
) -> list[MetricsStore]:
    """
    Generate all metrics for a given list of inputs and benchmarks.

    This calls `datapipe_testbench.benchmark.Benchmark.generate_metrics` for
    each input dataset. The output will be stored in experiments_path, with
    subdirectories created using the name field of each `InputDataset`. The
    subdirectories will contain the generated metrics.

    Parameters
    ----------
    input_dataset_list : list[InputDataset]
        Input information for each experiment
    benchmark_list : list[Benchmark]
        Which benchmarks to generate metrics for
    experiments_path : Path | str
        Where to write out the metrics. Inside, a subdirectory for each
        inputdataset will be generated
    skip_existing: bool
        If True, don't re-generate metrics that already exist. Note that this
        does not detect changes in the Benchmark options, it only checks if
        the required outputs already exist.

    Returns
    -------
    list[MetricsStore]:
        list of generated metrics, corresponding to each InputDataset. These
        can also be loaded up at a future time.
    """
    experiments_path = Path(experiments_path).expanduser()

    # load the stores for each experiment and check if we have the required
    # data for the requested benchmarks.
    print("Processing:")
    print(f" * Benchmarks: {[x.name for x in benchmark_list]}")
    print(f" * Inputs: {[x.name for x in input_dataset_list]}")

    # check inputs for every benchmark/input combination up front, before
    # any (potentially slow) metric generation starts.
    for benchmark, input_data in itertools.product(benchmark_list, input_dataset_list):
        benchmark.check_input_dataset(input_data)

    # Initialize metric stores so we can read the data:
    metric_store_list = [
        MetricsStore.from_path_and_input_dataset(
            experiments_path / input_data.name, input_data
        )
        for input_data in input_dataset_list
    ]

    # Process the data for each combination.
    to_process = list(itertools.product(metric_store_list, benchmark_list))
    for metric_store, benchmark in tqdm(to_process, desc="generating"):
        print(f"{metric_store.name} -- {benchmark.name}")
        # Short-circuit: only probe for existing outputs when skipping is
        # requested, so missing_outputs() is not called needlessly when we
        # are going to regenerate anyway.
        if not skip_existing or benchmark.missing_outputs(metric_store):
            benchmark.generate_metrics(metric_store)
        else:
            print(" --> already exists, skipping...")

    return metric_store_list
def run_comparison_study(
    name: str,
    experiment_names: list[str],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    studies_path: Path | str,
    metadata: dict | None = None,
) -> ResultStore:
    """
    Generate output plots comparing the list of Metrics for the given
    Benchmarks.

    The outputs will be written to ``{studies_path}/{name}/*``

    Parameters
    ----------
    name: str
        Name of this study
    experiment_names: list[str]
        list of names of experiments within experiments_path. These will be
        the name of the InputDataset used to write them if you used
        `generate_all_metrics()`
    benchmark_list : list[Benchmark]
        List of Benchmarks to generate plots for.
    experiments_path: Path | str
        Base path of where your experiments are stored
    studies_path : Path | str
        Base path of where to store study results. The output will be in a
        subdirectory of this path by name.
    metadata: dict | None
        any other metadata you want to store with this study

    Returns
    -------
    ResultStore:
        the output
    """
    experiments_path = Path(experiments_path).expanduser()
    studies_path = Path(studies_path).expanduser()

    # The study gets its own subdirectory under studies_path, named after it.
    result_store = ResultStore(base_path=studies_path / name, name=name)
    if metadata:
        result_store.metadata.update(metadata)

    experiments = load_experiments(
        experiment_names=experiment_names, experiments_path=experiments_path
    )

    for benchmark in tqdm(benchmark_list, desc="benchmarking"):
        benchmark.compare_to_reference(experiments, result_store)

    # also writeout the visualization of inputs -> benchmarks as a PDF
    datasets = [metric_store.get_inputdata() for metric_store in experiments]
    overview_graph = graphviz_inputs_to_benchmarks(
        input_dataset_list=datasets, benchmark_list=benchmark_list
    )
    overview_graph.render(
        filename="study.dot", directory=result_store.base_path, format="pdf"
    )

    return result_store
def visualize_comparison_study(
    experiment_names: list[str],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    file_length: int = 40,
) -> graphviz.Digraph:
    """Show an overview of this study."""
    # Load the stores so we can recover the InputDataset behind each one.
    experiments = load_experiments(
        experiment_names=experiment_names, experiments_path=experiments_path
    )
    datasets = [metric_store.get_inputdata() for metric_store in experiments]
    overview = graphviz_inputs_to_benchmarks(
        input_dataset_list=datasets,
        benchmark_list=benchmark_list,
        file_length=file_length,
    )
    return overview