#!/usr/bin/env python3
"""Helper functions for processing data with datapipe-testbench."""
import itertools
import logging
from pathlib import Path
import graphviz
from tqdm.auto import tqdm
from .benchmark import Benchmark
from .inputdataset import InputDataset
from .store import MetricsStore, ResultStore
from .utils import CATEGORY_NAME_MAP
from .visualization import graphviz_inputs_to_benchmarks
LOGGER = logging.getLogger("datapipe-testbench")
__all__ = [
"print_benchmark_info",
"generate_all_metrics",
"load_experiments",
"run_comparison_study",
"visualize_comparison_study",
"rename_telescope_type",
"print_telescope_type_transforms",
"InputDataset",
"ResultStore",
]
def rename_telescope_type(old_name: str, new_name: str) -> None:
    """
    Rename a telescope type, to handle cases where the name changed in the simulations.

    Get the current list of re-mappings with `print_telescope_type_transforms()`.

    Parameters
    ----------
    old_name: str
        telescope type name as it appears in the input simulations
    new_name: str
        name the old one should be mapped to

    Returns
    -------
    None
    """
    # Mutates the shared module-level mapping, so the rename applies to all
    # subsequent category lookups in this process.
    CATEGORY_NAME_MAP[old_name] = new_name
def print_benchmark_info(benchmark: Benchmark) -> None:
    """
    Print a human-readable description of a given Benchmark.

    Shows the benchmark's name, its docstring, the input files required for
    metric generation, and the output metrics it produces.

    Parameters
    ----------
    benchmark: Benchmark
        the benchmark to describe

    Returns
    -------
    None
    """
    print(benchmark.name)
    print("-" * 40)
    print(benchmark.__doc__)
    print("Input Files Required for Metric Generation:")
    # iterate directly — no need to materialize a throwaway list
    for req in benchmark.required_inputs:
        print(f" * {req}")
    print()
    print("Output Metrics:")
    for name, output in benchmark.outputs().items():
        print(f" * {name:20s}: {output}")
    print()
def load_experiments(
    experiment_names: list[str], experiments_path: Path | str
) -> list[MetricsStore]:
    """
    Get the MetricStores for a list of previously-generated experiments.

    Parameters
    ----------
    experiment_names: list[str]
        list of names of experiments within experiments_path. These will be the
        name of the InputDataset used to write them if you used
        `generate_all_metrics()`
    experiments_path: Path | str
        Where your experiments are stored.

    Returns
    -------
    list[MetricsStore]:
        List containing the MetricsStore for each experiment in the
        experiment_names list.

    Raises
    ------
    FileNotFoundError
        if any named experiment directory does not exist under
        experiments_path.
    """
    experiments_path = Path(experiments_path)
    metric_store_list = []
    for name in experiment_names:
        path = experiments_path / name
        # fail fast with a clear message rather than letting MetricsStore
        # fail on a missing directory
        if not path.exists():
            raise FileNotFoundError(f"Couldn't find experiment: {path}")
        metric_store_list.append(MetricsStore(path))
    return metric_store_list
def generate_all_metrics(
    input_dataset_list: list[InputDataset],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    skip_existing: bool = False,
) -> list[MetricsStore]:
    """
    Generate all metrics for a given list of inputs and benchmarks.

    This calls `datapipe_testbench.benchmark.Benchmark.generate_metrics` for
    each input dataset. The output will be stored in experiments_path, with
    subdirectories created using the name field of each `InputDataset`. The
    subdirectories will contain the generated metrics.

    Parameters
    ----------
    input_dataset_list : list[InputDataset]
        Input information for each experiment
    benchmark_list : list[Benchmark]
        Which benchmarks to generate metrics for
    experiments_path : Path | str
        Where to write out the metrics. Inside,
        a subdirectory for each inputdataset will be
        generated
    skip_existing: bool
        If True, don't re-generate metrics that already exist.
        Note that this does not detect changes in the Benchmark
        options, it only checks if the required outputs already exist.

    Returns
    -------
    list[MetricsStore]:
        list of generated metrics, corresponding to each InputDataset.
        These can also be loaded up at a future time.
    """
    experiments_path = Path(experiments_path).expanduser()

    print("Processing:")
    print(f" * Benchmarks: {[x.name for x in benchmark_list]}")
    print(f" * Inputs: {[x.name for x in input_dataset_list]}")

    # Validate every (benchmark, input) pairing up-front, so we fail fast
    # before any expensive metric generation has started.
    for benchmark, input_data in itertools.product(benchmark_list, input_dataset_list):
        benchmark.check_input_dataset(input_data)

    # Initialize one metric store per input dataset so we can read the data:
    metric_store_list = [
        MetricsStore.from_path_and_input_dataset(
            experiments_path / input_data.name, input_data
        )
        for input_data in input_dataset_list
    ]

    # Process each (store, benchmark) combination with a progress bar.
    to_process = list(itertools.product(metric_store_list, benchmark_list))
    for metric_store, benchmark in tqdm(to_process, desc="generating"):
        print(f"{metric_store.name} -- {benchmark.name}")
        # Generate when outputs are missing, or unconditionally when
        # skip_existing is False.
        if benchmark.missing_outputs(metric_store) or not skip_existing:
            benchmark.generate_metrics(metric_store)
        else:
            print(" --> already exists, skipping...")
    return metric_store_list
def run_comparison_study(
    name: str,
    experiment_names: list[str],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    studies_path: Path | str,
    metadata: dict | None = None,
) -> ResultStore:
    """
    Generate output plots comparing the list of Metrics for the given Benchmarks.

    The outputs will be written to ``{studies_path}/{name}/*``

    Parameters
    ----------
    name: str
        Name of this study
    experiment_names: list[str]
        list of names of experiments within experiments_path. These will be the
        name of the InputDataset used to write them if you used
        `generate_all_metrics()`
    benchmark_list : list[Benchmark]
        List of Benchmarks to generate plots for.
    experiments_path: Path | str
        Base path of where your experiments are stored
    studies_path : Path | str
        Base path of where to store study results. The output will be
        in a subdirectory of this path by name.
    metadata: dict | None
        any other metadata you want to store with this study

    Returns
    -------
    ResultStore:
        the output
    """
    studies_path = Path(studies_path).expanduser()
    experiments_path = Path(experiments_path).expanduser()

    result_store = ResultStore(base_path=studies_path / name, name=name)
    # only merge user metadata when some was actually supplied
    if metadata:
        result_store.metadata.update(metadata)

    experiments = load_experiments(
        experiment_names=experiment_names, experiments_path=experiments_path
    )

    for benchmark in tqdm(benchmark_list, desc="benchmarking"):
        benchmark.compare_to_reference(experiments, result_store)

    # also write out a graph visualization of inputs -> benchmarks
    input_dataset_list = [ms.get_inputdata() for ms in experiments]
    digraph = graphviz_inputs_to_benchmarks(
        input_dataset_list=input_dataset_list, benchmark_list=benchmark_list
    )
    digraph.render(filename="study.dot", directory=result_store.base_path, format="pdf")
    return result_store
def visualize_comparison_study(
    experiment_names: list[str],
    benchmark_list: list[Benchmark],
    experiments_path: Path | str,
    file_length: int = 40,
) -> graphviz.Digraph:
    """
    Show an overview of this study as a graph of inputs to benchmarks.

    Parameters
    ----------
    experiment_names: list[str]
        names of experiments within experiments_path (see `load_experiments`)
    benchmark_list : list[Benchmark]
        benchmarks to include in the graph
    experiments_path: Path | str
        base path of where your experiments are stored
    file_length: int
        maximum displayed length of file names in the graph
        (passed through to `graphviz_inputs_to_benchmarks`)

    Returns
    -------
    graphviz.Digraph:
        the rendered overview graph
    """
    experiments = load_experiments(
        experiment_names=experiment_names, experiments_path=experiments_path
    )
    input_dataset_list = [ms.get_inputdata() for ms in experiments]
    return graphviz_inputs_to_benchmarks(
        input_dataset_list=input_dataset_list,
        benchmark_list=benchmark_list,
        file_length=file_length,
    )