#!/usr/bin/env python3
"""
Defines what is a Benchmark.
"""
import logging
import re
import warnings
from abc import abstractmethod
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path
from matplotlib.figure import FigureBase
from .inputdataset import InputDataset
from .store import MetricsStore, ResultStore
__all__ = ["Benchmark", "BenchmarkResult", "ComparisonStatus", "MissingInputError"]
class ComparisonStatus(str, Enum):
    """Status of a benchmark check.

    NOTE(review): with a ``str`` mixin, ``auto()`` produces the stringified
    integer counter ("1".."4"), not the member name. If name-valued members
    were intended, ``enum.StrEnum`` (Python 3.11+) would be needed — confirm
    before changing, since any stored/serialized values would differ.
    """

    PASSED = auto()  #: benchmark passed
    WARNING = auto()  #: benchmark passed, but only barely, should check
    FAILED = auto()  #: benchmark failed
    OTHER = auto()  #: other failure, or couldn't complete benchmark
@dataclass
class BenchmarkResult:
    """Output of a benchmark check; each comparison yields one or more of these."""

    benchmark_name: str  # friendly name of the Benchmark that produced this result
    reference_dataset_name: str  # name of the dataset used as the reference
    test_dataset_name: str  # name of the dataset compared against the reference
    status: ComparisonStatus  # outcome of the comparison
    comment: str  # human-readable explanation of the status
    plots: list[FigureBase] | None  # supporting figures, if any were produced
class Benchmark:
    """Defines what a "scientist" should define for a given benchmark.

    A benchmark is defined by two functions:

    1. ``generate_metrics()``, which will be used to transform event data into
       one or more ``Metric`` stored in a ``MetricsStore`` corresponding to a
       single set of processed events.
    2. ``compare_to_reference()``, which compares one or more MetricsStores to
       a reference MetricsStore. This function is run after there are multiple
       MetricsStores available that have been processed with
       ``generate_metrics``.

    NOTE(review): this class uses ``@abstractmethod`` but does not inherit
    from ``abc.ABC``, so instantiating an incomplete subclass is not blocked
    at construction time — confirm whether that is intentional before adding
    the ABC base (doing so would raise ``TypeError`` for existing direct
    instantiations).
    """

    def __init__(self):
        super().__init__()
        # One logger per concrete benchmark class, named after the subclass.
        self._log = logging.getLogger(self.__class__.__name__)

    @property
    @abstractmethod
    def required_inputs(self) -> set[str]:
        """Return the set of required keys in the InputDataset that this Benchmark needs.

        These are checked before generating metrics.
        """
        return set()

    @abstractmethod
    def generate_metrics(self, metric_store: MetricsStore) -> dict | None:
        """Produce metrics for this benchmark for later comparison.

        Called once per benchmark per ``MetricsStore``, i.e. for one set of
        input events, defines how to transform the events into metrics stored
        in a ``MetricsStore`` that can later be compared.

        Parameters
        ----------
        metric_store: MetricsStore
            Where to store the metrics generated by this ``Benchmark``. It
            must be initialized with an
            `datapipe_testbench.inputdataset.InputDataset` containing the
            inputs to use for the transformation into metrics.
        """

    @abstractmethod
    def compare_to_reference(
        self, metric_store_list: list[MetricsStore], result_store: ResultStore
    ) -> dict | None:
        """Perform the comparison study on metrics associated with a set of experiments.

        This function takes a list of MetricStores that have been previously
        filled by `datapipe_testbench.benchmark.Benchmark.generate_metrics()`,
        generates plots and comparison results, and writes the output to a
        ResultsStore. If there is more than one MetricStore to compare, the
        first one is considered the _reference_.

        Parameters
        ----------
        metric_store_list: list[MetricsStore]
            The list of MetricStores to compare, the first of which is the
            _reference_.
        result_store: ResultStore
            Where the results of the comparison study are stored.
        """
        # TODO: some subclasses return a list of BenchmarkResults here, check how used

    @abstractmethod
    def make_report(self, result_store: ResultStore):
        """Make a report for the benchmark using the results of the comparisons.

        The result_store must already contain the outputs of
        ``compare_to_reference()``.

        Parameters
        ----------
        result_store : ResultStore
            Contains the input comparisons and is where the report will be
            stored.
        """

    def _compare_metrics_in_stores(self, metric_store_list, result_store) -> dict:
        """Compare metrics loaded from stores.

        Helper function for use within ``compare_to_reference()`` when
        possible.

        Parameters
        ----------
        metric_store_list : list[MetricsStore]
            The stores containing the metrics to compare. The first store in
            the list is considered the reference.
        result_store : ResultStore
            The store into which comparison results will be saved.

        Returns
        -------
        dict[str, BenchmarkResult]
            Mapping of benchmark results. Note there may be more of these than
            there are MetricsStores if the benchmark is applied for example to
            multiple telescope types; there would be one per telescope type
            per MetricsStore.
        """
        # Go through outputs() (which falls back to the deprecated
        # self.output_names) so this helper also works for subclasses that
        # implement outputs() as the deprecation message requests.
        output_map = self.outputs()

        # Load each named metric from every store; index 0 is the reference.
        metrics = {name: [] for name in output_map}
        for name, loaded in metrics.items():
            for ds in metric_store_list:
                loaded.append(ds.retrieve_data(output_map[name]))
                # Tag each metric with its store of origin for plot legends.
                loaded[-1].label = ds.name

        comparisons = dict()
        for name, loaded in metrics.items():
            ref, others = loaded[0], loaded[1:]
            compared = ref.compare(others)
            # Use the class logger instead of print() so output respects the
            # application's logging configuration.
            self._log.info("Ref metric %s, plots %s", ref.name, compared.plots.keys())
            self._log.info("Output saved in %s", compared.name)
            # NOTE(review): self.metric_path is expected to be provided by the
            # subclass — it is not defined in this base class; confirm.
            compared.store(result_store, self.metric_path)
            comparisons[name] = compared
        return comparisons

    @property
    def name(self):
        """Return friendly name of this benchmark, derived from the class name."""
        # Split the CamelCase class name into words, e.g.
        # "MuonRingBenchmark" -> "Muon Ring Benchmark".
        parts = re.sub(
            "([A-Z][a-z]+)", r" \1", re.sub("([A-Z]+)", r" \1", self.__class__.__name__)
        ).split()
        if parts[-1] != "Benchmark":
            parts.append("Benchmark")
        return " ".join(parts)

    @abstractmethod
    def outputs(self) -> dict[str, Path]:
        """Return mapping of key to Path of all outputs that this Benchmark generates."""
        # Legacy fallback: old subclasses declared self.output_names instead
        # of implementing outputs(). If neither is provided, the getattr below
        # raises AttributeError (pre-existing behavior, kept as-is).
        if hasattr(self, "output_names"):
            warnings.warn(
                "This benchmark should implement outputs() and not self.output_names",
                category=DeprecationWarning,
            )
        return getattr(self, "output_names")

    def missing_outputs(self, metric_store: MetricsStore):
        """Return the list of output names not yet present in *metric_store*."""
        return [
            name
            for name, path in self.outputs().items()
            if not metric_store.data_exists(path)
        ]