# Source code for baybe.simulation.lookup

"""Target lookup mechanisms."""

from __future__ import annotations

import logging
from collections.abc import Callable, Collection
from typing import Literal

import numpy as np
import pandas as pd

from baybe.simulation._imputation import impute_target_values
from baybe.targets.base import Target
from baybe.utils.dataframe import add_fake_measurements

_logger = logging.getLogger(__name__)



# [docs]
def look_up_targets(
    queries: pd.DataFrame,
    targets: Collection[Target],
    lookup: pd.DataFrame | Callable[[pd.DataFrame], pd.DataFrame] | None,
    impute_mode: Literal[
        "error", "worst", "best", "mean", "random", "ignore"
    ] = "error",
) -> None:
    """Populate the target columns of a dataframe via a lookup mechanism.

    Note:
        No new dataframe is created; ``queries`` is modified in-place.

    Args:
        queries: The dataframe to be filled with target values. Its content must
            be compatible with the chosen lookup mechanism.
        targets: The targets whose values are to be looked up.
        lookup: The lookup mechanism. Can be one of the following choices:

            -   A dataframe mapping rows of ``queries`` to the corresponding target
                values. That is, it must contain the same columns as ``queries`` plus
                one additional column for each of the given targets.
            -   A callable that is invoked with ``queries`` and returns a dataframe
                whose columns are written into ``queries``.
            -   ``None``. Produces fake values for all targets.
        impute_mode: Specifies how a missing lookup will be handled. Only relevant for
            dataframe lookups. Can be one of the following choices:

            - ``"error"``: An error will be thrown.
            - ``"worst"``: Imputes the worst available value for each target.
            - ``"best"``: Imputes the best available value for each target.
            - ``"mean"``: Imputes the mean value for each target.
            - ``"random"``: A random row will be used for the lookup.
            - ``"ignore"``: Missing lookups are not expected to occur in this mode;
              encountering one raises an ``AssertionError``.

    Raises:
        ValueError: If an unsupported lookup mechanism is provided.
        IndexError: If a dataframe lookup finds no matching row and ``impute_mode``
            is ``"error"``.
        AssertionError: If a dataframe lookup finds no matching row and
            ``impute_mode`` is ``"ignore"``.

    Example:
        >>> import pandas as pd
        >>> from baybe.targets.numerical import NumericalTarget
        >>> from baybe.simulation.lookup import look_up_targets
        >>>
        >>> targets = [NumericalTarget("target")]
        >>> df = pd.DataFrame({"x": [1, 2, 3]})
        >>> lookup_df = pd.DataFrame({"x": [1, 2], "target": [10, 20]})
        >>> look_up_targets(df, targets, lookup_df, impute_mode="mean")
        >>> print(df)
           x  target
        0  1    10.0
        1  2    20.0
        2  3    15.0
    """
    # Dispatch on the kind of lookup mechanism; each handler mutates `queries`.
    if lookup is None:
        add_fake_measurements(queries, targets)
        return

    if isinstance(lookup, Callable):
        _look_up_targets_from_callable(queries, lookup)
        return

    if isinstance(lookup, pd.DataFrame):
        _look_up_targets_from_dataframe(queries, targets, lookup, impute_mode)
        return

    raise ValueError("Unsupported lookup mechanism.")



def _look_up_targets_from_callable(
    queries: pd.DataFrame, lookup: Callable[[pd.DataFrame], pd.DataFrame]
) -> None:
    """Fill target values into ``queries`` using the output of a callable.

    Args:
        queries: The dataframe to be modified in-place. It is handed to ``lookup``
            unchanged.
        lookup: A callable returning a dataframe. Every column of the returned
            dataframe is written into ``queries`` under the same column name.
    """
    looked_up = lookup(queries)

    # Whatever columns the callable returned are the ones being written.
    returned_columns = looked_up.columns
    queries[returned_columns] = looked_up


def _look_up_targets_from_dataframe(
    queries: pd.DataFrame,
    targets: Collection[Target],
    lookup: pd.DataFrame,
    impute_mode: Literal[
        "error", "worst", "best", "mean", "random", "ignore"
    ] = "error",
) -> None:
    """Fill the target columns of ``queries`` by exact row matching in ``lookup``.

    Each row of ``queries`` is compared against the equally named columns of
    ``lookup``. The target values of the matching lookup row are collected and
    finally written into ``queries`` in-place.

    Args:
        queries: The dataframe to be modified in-place. All of its columns are
            used for the matching.
        targets: The targets whose names identify the value columns in ``lookup``.
        lookup: The dataframe holding the query columns plus the target columns.
        impute_mode: How to proceed when a query row has no match in ``lookup``.
            ``"ignore"`` and ``"error"`` raise; every other mode is delegated to
            ``impute_target_values``.

    Raises:
        AssertionError: If a query row has no match and ``impute_mode`` is
            ``"ignore"``.
        IndexError: If a query row has no match and ``impute_mode`` is ``"error"``.
    """
    # IMPROVE: Although it's not too important for a simulation, this
    #   could also be implemented for approximate matches

    target_columns = [target.name for target in targets]

    def _resolve(query_row: pd.Series):
        """Return the target values belonging to a single query row."""
        # A lookup row matches only if it agrees with the query in every column
        is_match = (lookup.loc[:, query_row.index] == query_row).all(
            axis=1, skipna=False
        )
        matched_labels = lookup[is_match].index.values
        n_matches = len(matched_labels)

        if n_matches == 1:
            # Exactly one match has been found
            resolved = lookup.loc[matched_labels[0], target_columns]

        elif n_matches == 0:
            # Parameter combination cannot be looked up and needs to be
            # imputed.
            if impute_mode == "ignore":
                raise AssertionError(
                    "Something went wrong for impute_mode 'ignore'. "
                    "It seems the search space was not correctly "
                    "reduced before recommendations were generated."
                )
            if impute_mode == "error":
                raise IndexError(
                    f"Cannot look up target values for {query_row}. "
                    "No matching row found in the lookup dataframe."
                )
            resolved = impute_target_values(targets, lookup, impute_mode)

        else:
            # More than one instance of this parameter combination
            # has been measured
            _logger.warning(
                "The lookup rows with indexes %s seem to be "
                "duplicates regarding parameter values. Choosing a "
                "random one.",
                matched_labels,
            )
            resolved = lookup.loc[np.random.choice(matched_labels), target_columns]

        return resolved.values

    # IMPROVE: do the entire matching at once via a merge
    collected_values = [_resolve(query_row) for _, query_row in queries.iterrows()]

    # Add the lookup values
    queries.loc[:, target_columns] = np.asarray(collected_values)