Source code for baybe.simulation.lookup

"""Target lookup mechanisms."""

from __future__ import annotations

import logging
from collections.abc import Callable, Collection
from typing import Literal

import numpy as np
import pandas as pd

from baybe.simulation._imputation import _impute_lookup
from baybe.targets.base import Target
from baybe.utils.dataframe import add_fake_results

_logger = logging.getLogger(__name__)



[docs]
def look_up_targets(
    queries: pd.DataFrame,
    targets: Collection[Target],
    lookup: pd.DataFrame | Callable | None,
    impute_mode: Literal[
        "error", "worst", "best", "mean", "random", "ignore"
    ] = "error",
) -> None:
    """Populate the target columns of a dataframe via a lookup mechanism.

    Note:
        No new dataframe is created; ``queries`` is modified in-place.

    Args:
        queries: The dataframe to be modified. Its content must be compatible with
            the chosen lookup mechanism.
        targets: The targets whose values are to be looked up.
        lookup: The lookup mechanism. The following choices are supported:

            -   A dataframe mapping rows of ``queries`` to the corresponding target
                values. That is, it must contain the same columns as ``queries``
                plus one additional column for each of the given targets.
            -   A callable, providing target values for each row of ``queries``.
            -   ``None``. Produces fake values for all targets.
        impute_mode: Specifies how a missing lookup will be handled. Only relevant
            for dataframe lookups. The following choices are supported:

            - ``"error"``: An error will be thrown.
            - ``"worst"``: Imputes the worst available value for each target.
            - ``"best"``: Imputes the best available value for each target.
            - ``"mean"``: Imputes the mean value for each target.
            - ``"random"``: A random row will be used for the lookup.
            - ``"ignore"``: No imputation is performed. A missing lookup raises an
              ``AssertionError`` because the search space is expected to have been
              reduced to the available lookup entries beforehand.

    Raises:
        ValueError: If an unsupported lookup mechanism is provided.

    Example:
        >>> import pandas as pd
        >>> from baybe.targets.numerical import NumericalTarget
        >>> from baybe.simulation.lookup import look_up_targets
        >>>
        >>> targets = [NumericalTarget("target", "MAX")]
        >>> df = pd.DataFrame({"x": [1, 2, 3]})
        >>> lookup_df = pd.DataFrame({"x": [1, 2], "target": [10, 20]})
        >>> look_up_targets(df, targets, lookup_df, impute_mode="mean")
        >>> print(df)
           x  target
        0  1    10.0
        1  2    20.0
        2  3    15.0
    """
    # No lookup given: fill the target columns with fake values
    if lookup is None:
        add_fake_results(queries, targets)
        return

    # Callable lookup: evaluate it for every query row
    if isinstance(lookup, Callable):
        _look_up_targets_from_callable(queries, targets, lookup)
        return

    # Dataframe lookup: match query rows against the table (imputation applies)
    if isinstance(lookup, pd.DataFrame):
        _look_up_targets_from_dataframe(queries, targets, lookup, impute_mode)
        return

    raise ValueError("Unsupported lookup mechanism.")



def _look_up_targets_from_callable(
    queries: pd.DataFrame,
    targets: Collection[Target],
    lookup: Callable,
) -> None:
    """Fill the target columns of ``queries`` by evaluating a callable per row.

    The callable receives the values of each row as positional arguments. Its
    return values are assigned to the targets by position, i.e. in the iteration
    order of ``targets``.

    Args:
        queries: The dataframe to be modified in-place.
        targets: The targets whose values are to be looked up.
        lookup: The callable providing the target values for a single row.

    Raises:
        AssertionError: If the number of values returned by the callable does not
            match the number of targets.
    """
    # TODO: Currently, the alignment of return values to targets is based on the
    #   column ordering, which is not robust. Instead, the callable should return
    #   a dataframe with properly labeled columns.

    def evaluate_row(row):
        """Call the lookup with the row values as positional arguments."""
        return lookup(*row.values)

    # The per-row results (typically tuples of floats) first end up in one
    # single column labeled 0
    results = queries.apply(evaluate_row, axis=1).to_frame()

    # Unpack that column so that each returned value gets a column of its own ...
    unpacked = pd.DataFrame(results[0].to_list(), index=results.index)

    # ... and merge the unpacked columns back, yielding one column per target
    results[unpacked.columns] = unpacked

    n_returned = results.shape[1]
    if n_returned != len(targets):
        raise AssertionError(
            "If you use an analytical function as lookup, make sure "
            "the configuration has the right amount of targets "
            "specified."
        )

    # Positional assignment: the k-th returned value belongs to the k-th target
    for position, target in zip(range(n_returned), targets):
        queries[target.name] = results.iloc[:, position]


def _look_up_targets_from_dataframe(
    queries: pd.DataFrame,
    targets: Collection[Target],
    lookup: pd.DataFrame,
    impute_mode: Literal[
        "error", "worst", "best", "mean", "random", "ignore"
    ] = "error",
) -> None:
    """Look up target values from a dataframe.

    Each row of ``queries`` is compared for exact equality against the equally
    named columns of ``lookup``. Depending on the number of matching lookup rows:

    - exactly one match: its target values are used,
    - several matches: a warning is logged and one of them is chosen at random,
    - no match: the values are imputed according to ``impute_mode``.

    Args:
        queries: The dataframe to be modified in-place. All of its columns must
            also be present in ``lookup``.
        targets: The targets whose values are to be looked up.
        lookup: The dataframe containing the query columns plus one column per
            target. Its index does not need to be unique.
        impute_mode: How to handle queries without a matching lookup row.

    Raises:
        AssertionError: If ``impute_mode`` is ``"ignore"`` but a query has no
            matching lookup row.
    """
    # IMPROVE: Although it's not too important for a simulation, this
    #  could also be implemented for approximate matches

    target_names = [t.name for t in targets]

    all_match_vals = []
    for _, row in queries.iterrows():
        # IMPROVE: do the entire matching at once via a merge
        is_match = (lookup.loc[:, row.index] == row).all(axis=1, skipna=False)

        # Identify matches by row position instead of index label: a label-based
        # access would return several rows whenever the lookup index contains
        # repeated labels, which would corrupt the collected values.
        positions = np.flatnonzero(is_match.to_numpy(dtype=bool))

        if len(positions) > 1:
            # More than one instance of this parameter combination
            # has been measured
            _logger.warning(
                "The lookup rows with indexes %s seem to be "
                "duplicates regarding parameter values. Choosing a "
                "random one.",
                lookup.index[positions].values,
            )
            chosen = np.random.choice(positions)
            match_vals = lookup.iloc[chosen].loc[target_names].values

        elif len(positions) < 1:
            # Parameter combination cannot be looked up and needs to be
            # imputed.
            if impute_mode == "ignore":
                raise AssertionError(
                    "Something went wrong for impute_mode 'ignore'. "
                    "It seems the search space was not correctly "
                    "reduced before recommendations were generated."
                )
            match_vals = _impute_lookup(row, lookup, targets, impute_mode)

        else:
            # Exactly one match has been found
            match_vals = lookup.iloc[positions[0]].loc[target_names].values

        # Collect the matches
        all_match_vals.append(match_vals)

    # Add the lookup values
    queries.loc[:, target_names] = np.asarray(all_match_vals)