# Source code for baybe.simulation.scenarios

"""Batch simulation of multiple campaigns."""

from __future__ import annotations

import warnings
from collections.abc import Callable
from copy import deepcopy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Literal

import numpy as np
import pandas as pd

from baybe.campaign import Campaign
from baybe.exceptions import NothingToSimulateError, UnusedObjectWarning
from baybe.simulation.core import simulate_experiment

if TYPE_CHECKING:
    from xarray import DataArray

_DEFAULT_SEED = 1337



# [docs]
def simulate_scenarios(
    scenarios: dict[Any, Campaign],
    lookup: pd.DataFrame | Callable | None = None,
    /,
    *,
    batch_size: int = 1,
    n_doe_iterations: int | None = None,
    initial_data: list[pd.DataFrame] | None = None,
    groupby: list[str] | None = None,
    n_mc_iterations: int = 1,
    random_seed: int | None = None,
    impute_mode: Literal[
        "error", "worst", "best", "mean", "random", "ignore"
    ] = "error",
    noise_percent: float | None = None,
) -> pd.DataFrame:
    """Simulate multiple Bayesian optimization scenarios.

    A wrapper function around :func:`baybe.simulation.core.simulate_experiment` that
    allows specifying multiple simulation settings at once. One simulation is run
    for every combination of scenario, Monte Carlo repetition and (if provided)
    initial data set.

    Args:
        scenarios: A dictionary mapping scenario identifiers to DOE specifications.
        lookup: See :func:`baybe.simulation.core.simulate_experiment`.
        batch_size: See :func:`baybe.simulation.core.simulate_experiment`.
        n_doe_iterations: See :func:`baybe.simulation.core.simulate_experiment`.
        initial_data: A list of initial data sets for which the scenarios should be
            simulated. If omitted or empty, the simulations are run without initial
            data.
        groupby: The names of the parameters to be used to partition the search space.
            A separate simulation will be conducted for each partition, with the search
            restricted to that partition.
        n_mc_iterations: The number of Monte Carlo simulations to be used.
        random_seed: An optional integer specifying the random seed for the first Monte
            Carlo run. Each subsequent run increases this value by 1. If omitted,
            the current random seed is used.
        impute_mode: See :func:`baybe.simulation.core.simulate_experiment`.
        noise_percent: See :func:`baybe.simulation.core.simulate_experiment`.

    Returns:
        A dataframe like returned from :func:`baybe.simulation.core.simulate_experiment`
        but with additional columns. See the ``Note`` for details.

    Note:
        The following additional columns are contained in the dataframe returned by this
        function:

        * ``Scenario``: Specifies the scenario identifier of the respective simulation.
        * ``Monte_Carlo_Run``: Specifies the Monte Carlo repetition of the
          respective simulation.
        * Optional, if ``random_seed`` is provided: A column ``Random_Seed`` that
          specifies the random seed used for the respective simulation.
        * Optional, if ``initial_data`` is provided: A column ``Initial_Data`` that
          specifies the index of the initial data set used for the respective
          simulation.
        * Optional, if ``groupby`` is provided: A column for each ``groupby`` parameter
          that specifies the search space partition considered for the respective
          simulation.
    """

    @dataclass
    class SimulationResult:
        """A thin wrapper to enable dataframe-valued return values with xyzpy.

        Args:
            result: The result of the simulation.
        """

        result: pd.DataFrame

    def make_xyzpy_callable(result_variable: str) -> Callable:
        """Make a batch simulator that allows running campaigns in parallel."""
        from baybe._optional.simulation import xyzpy

        @xyzpy.label(var_names=[result_variable])
        def simulate(
            Scenario: Any,
            Monte_Carlo_Run: int,
            Initial_Data=None,
        ):
            """Callable for xyzpy simulation."""
            # ``Initial_Data`` is only supplied when the corresponding combo axis
            # exists, i.e. when a non-empty ``initial_data`` list was passed. Keying
            # on the index (rather than on ``initial_data``) keeps this lookup in
            # sync with the combo construction below and avoids indexing with None.
            data = None if Initial_Data is None else initial_data[Initial_Data]

            # The user-provided seed applies to the first Monte Carlo run; every
            # subsequent run is offset by its run index.
            seed = None if random_seed is None else random_seed + Monte_Carlo_Run

            result = _simulate_groupby(
                scenarios[Scenario],
                lookup,
                batch_size=batch_size,
                n_doe_iterations=n_doe_iterations,
                initial_data=data,
                groupby=groupby,
                random_seed=seed,
                impute_mode=impute_mode,
                noise_percent=noise_percent,
            )
            if random_seed is not None:
                result["Random_Seed"] = seed
            return SimulationResult(result)

        return simulate

    def unpack_simulation_results(array: DataArray) -> pd.DataFrame:
        """Turn the xyzpy simulation results into a flat dataframe."""
        # Convert to a series (indexed by the simulation settings) and remove the
        # wrapper layer
        series = array.to_series()
        series = series.apply(lambda x: x.result)

        # Un-nest all simulation results: prepend the setting columns to each
        # individual result. The single settings row is repeated along the index
        # of the respective result dataframe.
        dfs = []
        for setting, df_result in series.items():
            df_setting = pd.DataFrame(
                [setting], columns=series.index.names, index=df_result.index
            )
            dfs.append(pd.concat([df_setting, df_result], axis=1))

        # Concatenate all results into a single dataframe
        return pd.concat(dfs, ignore_index=True)

    # Collect the settings to be simulated
    combos: dict[str, Any] = {
        "Scenario": list(scenarios),
        "Monte_Carlo_Run": range(n_mc_iterations),
    }
    if initial_data:
        combos["Initial_Data"] = range(len(initial_data))

    # Simulate and unpack
    result_variable = "simulation_result"
    batch_simulator = make_xyzpy_callable(result_variable)

    # Suppress the unused-object warnings emitted from the nonpredictive recommender
    # base module while the batch simulation is running
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            category=UnusedObjectWarning,
            module="baybe.recommenders.pure.nonpredictive.base",
        )
        da_results = batch_simulator.run_combos(combos)[result_variable]

    return unpack_simulation_results(da_results)



def _simulate_groupby(
    campaign: Campaign,
    lookup: pd.DataFrame | Callable[..., tuple[float, ...]] | None = None,
    /,
    *,
    batch_size: int = 1,
    n_doe_iterations: int | None = None,
    initial_data: pd.DataFrame | None = None,
    groupby: list[str] | None = None,
    random_seed: int | None = _DEFAULT_SEED,
    impute_mode: Literal[
        "error", "worst", "best", "mean", "random", "ignore"
    ] = "error",
    noise_percent: float | None = None,
) -> pd.DataFrame:
    """Scenario simulation for different search space partitions.

    A wrapper around :func:`baybe.simulation.core.simulate_experiment` that allows to
    partition the search space into different groups and run separate simulations for
    all groups where the search is restricted to the corresponding partition.
    Groups for which there is nothing to simulate are skipped.

    Args:
        campaign: See :func:`baybe.simulation.core.simulate_experiment`.
        lookup: See :func:`baybe.simulation.core.simulate_experiment`.
        batch_size: See :func:`baybe.simulation.core.simulate_experiment`.
        n_doe_iterations: See :func:`baybe.simulation.core.simulate_experiment`.
        initial_data: See :func:`baybe.simulation.core.simulate_experiment`.
        groupby: See :func:`baybe.simulation.scenarios.simulate_scenarios`. If
            omitted or empty, the entire search space is treated as a single group.
        random_seed: See :func:`baybe.simulation.core.simulate_experiment`.
        impute_mode: See :func:`baybe.simulation.core.simulate_experiment`.
        noise_percent: See :func:`baybe.simulation.core.simulate_experiment`.

    Returns:
        A dataframe like returned from
        :func:`baybe.simulation.core.simulate_experiment`, but with additional
        ``groupby columns`` (named according to the specified groupby parameters) that
        subdivide the results into the different simulations.

    Raises:
        NothingToSimulateError: If there is nothing to simulate.
    """
    # Create the groups. If no grouping is specified, use a single group containing
    # all parameter configurations.
    # NOTE: In the following, we intentionally work with *integer* indexing (iloc)
    #   instead of pandas indexes (loc), because the latter would yield wrong
    #   results in cases where the search space dataframe contains duplicate
    #   index entries (i.e., controlling the recommendable entries would affect
    #   all duplicates). While duplicate entries should be prevented by the search
    #   space constructor, the integer-based indexing provides a second safety net.
    #   Hence, the "reset_index" call.
    exp_rep = campaign.searchspace.discrete.exp_rep.reset_index()
    # TODO: Implement SubspaceDiscrete.__len__
    n_configurations = len(exp_rep)

    # An empty list of grouping parameters is treated like "no grouping" since
    # pandas cannot group by zero keys
    use_groups = bool(groupby)
    groups = exp_rep.groupby(groupby) if use_groups else ((None, exp_rep),)

    # Simulate all subgroups
    dfs = []
    for group_id, group in groups:
        # Create a campaign that focuses only on the current group by excluding
        # off-group configurations from the candidates list
        # TODO: Reconsider if deepcopies are required once [16605] is resolved
        campaign_group = deepcopy(campaign)

        # Boolean mask over all configurations: True for everything outside the
        # current group (the group index holds integer positions after the reset)
        off_group_idx = np.full(n_configurations, fill_value=True, dtype=bool)
        off_group_idx[group.index.values] = False

        # TODO [16605]: Avoid direct manipulation of metadata
        campaign_group.searchspace.discrete.metadata.loc[
            off_group_idx, "dont_recommend"
        ] = True

        # Run the group simulation, skipping groups that offer nothing to simulate
        try:
            df_group = simulate_experiment(
                campaign_group,
                lookup,
                batch_size=batch_size,
                n_doe_iterations=n_doe_iterations,
                initial_data=initial_data,
                random_seed=random_seed,
                impute_mode=impute_mode,
                noise_percent=noise_percent,
            )
        except NothingToSimulateError:
            continue

        # Add the group columns. Depending on the pandas version and the number of
        # grouping parameters, the group identifier may be a scalar or a tuple.
        if use_groups:
            group_tuple = group_id if isinstance(group_id, tuple) else (group_id,)
            context = pd.DataFrame([group_tuple], columns=groupby, index=df_group.index)
            df_group = pd.concat([context, df_group], axis=1)

        dfs.append(df_group)

    # Collect all results
    if not dfs:
        raise NothingToSimulateError(
            "There is nothing to simulate for any of the search space partitions."
        )

    return pd.concat(dfs, ignore_index=True)