
"""Discrete subspaces."""

from __future__ import annotations

import os
import warnings
from collections.abc import Collection, Sequence
from itertools import compress
from math import prod
from typing import TYPE_CHECKING, Any

import numpy as np
import pandas as pd
from attr import define, field
from cattrs import IterableValidationError

from baybe.constraints import (
    DISCRETE_CONSTRAINTS_FILTERING_ORDER,
    validate_constraints,
)
from baybe.constraints.base import DiscreteConstraint
from baybe.exceptions import OptionalImportError
from baybe.parameters import (
    CategoricalParameter,
    NumericalDiscreteParameter,
    TaskParameter,
)
from baybe.parameters.base import DiscreteParameter, Parameter
from baybe.parameters.utils import get_parameters_from_dataframe
from baybe.searchspace.validation import (
    get_transform_parameters,
    validate_parameter_names,
    validate_parameters,
)
from baybe.serialization import SerialMixin, converter, select_constructor_hook
from baybe.utils.basic import to_tuple
from baybe.utils.boolean import eq_dataframe
from baybe.utils.dataframe import (
    df_drop_single_value_columns,
    fuzzy_row_match,
    pretty_print_df,
)
from baybe.utils.memory import bytes_to_human_readable
from baybe.utils.numerical import DTypeFloatNumpy

if TYPE_CHECKING:
    import polars as pl

    from baybe.searchspace.core import SearchSpace

_METADATA_COLUMNS = ["was_recommended", "was_measured", "dont_recommend"]


@define(kw_only=True)
class MemorySize:
    """Estimated memory size of a :class:`SubspaceDiscrete`."""

    exp_rep_bytes: float
    """The memory size of the experimental representation dataframe in bytes."""

    exp_rep_shape: tuple[int, int]
    """The shape of the experimental representation dataframe."""

    comp_rep_bytes: float
    """The memory size of the computational representation dataframe in bytes."""

    comp_rep_shape: tuple[int, int]
    """The shape of the computational representation dataframe."""

    @property
    def exp_rep_human_readable(self) -> tuple[float, str]:
        """Human-readable memory size of the experimental representation dataframe.

        Consists of a tuple containing memory size and unit.
        """
        return bytes_to_human_readable(self.exp_rep_bytes)

    @property
    def comp_rep_human_readable(self) -> tuple[float, str]:
        """Human-readable memory size of the computational representation dataframe.

        Consists of a tuple containing memory size and unit.
        """
        return bytes_to_human_readable(self.comp_rep_bytes)
@define
class SubspaceDiscrete(SerialMixin):
    """Class for managing discrete subspaces.

    Builds the subspace from parameter definitions and optional constraints, keeps
    track of search metadata, and provides access to candidate sets and different
    parameter views.
    """

    parameters: tuple[DiscreteParameter, ...] = field(
        converter=to_tuple, validator=lambda _, __, x: validate_parameter_names(x)
    )
    """The list of parameters of the subspace."""

    exp_rep: pd.DataFrame = field(eq=eq_dataframe)
    """The experimental representation of the subspace."""

    metadata: pd.DataFrame = field(eq=eq_dataframe)
    """The metadata."""

    empty_encoding: bool = field(default=False)
    """Flag encoding whether an empty encoding is used."""

    constraints: tuple[DiscreteConstraint, ...] = field(
        converter=to_tuple, factory=tuple
    )
    """A list of constraints for restricting the space."""

    comp_rep: pd.DataFrame = field(eq=eq_dataframe)
    """The computational representation of the space.

    Technically not required but added as an optional initializer argument to allow
    ingestion from e.g. serialized objects and thereby speed up construction. If not
    provided, the default hook will derive it from ``exp_rep``.
    """

    def __str__(self) -> str:
        if self.is_empty:
            return ""

        start_bold = "\033[1m"
        end_bold = "\033[0m"

        # Convert the lists to dataframes to be able to use pretty printing
        param_list = [param.summary() for param in self.parameters]
        constraints_list = [constr.summary() for constr in self.constraints]
        param_df = pd.DataFrame(param_list)
        constraints_df = pd.DataFrame(constraints_list)

        # Get summary information from metadata
        was_recommended_count = len(self.metadata[self.metadata[_METADATA_COLUMNS[0]]])
        was_measured_count = len(self.metadata[self.metadata[_METADATA_COLUMNS[1]]])
        dont_recommend_count = len(self.metadata[self.metadata[_METADATA_COLUMNS[2]]])
        metadata_count = len(self.metadata)

        # Put all attributes of the discrete class in one string.
        discrete_str = f"""{start_bold}Discrete Search Space{end_bold}
            \n{start_bold}Discrete Parameters{end_bold}\n{pretty_print_df(param_df)}
            \n{start_bold}Experimental Representation{end_bold}
            \r{pretty_print_df(self.exp_rep)}\n\n{start_bold}Metadata:{end_bold}
            \r{_METADATA_COLUMNS[0]}: {was_recommended_count}/{metadata_count}
            \r{_METADATA_COLUMNS[1]}: {was_measured_count}/{metadata_count}
            \r{_METADATA_COLUMNS[2]}: {dont_recommend_count}/{metadata_count}
            \n{start_bold}Constraints{end_bold}\n{pretty_print_df(constraints_df)}
            \n{start_bold}Computational Representation{end_bold}
            \r{pretty_print_df(self.comp_rep)}"""

        return discrete_str.replace("\n", "\n ").replace("\r", "\r ")

    @exp_rep.validator
    def _validate_exp_rep(  # noqa: DOC101, DOC103
        self, _: Any, exp_rep: pd.DataFrame
    ) -> None:
        """Validate the experimental representation.

        Raises:
            ValueError: If the index of the provided dataframe contains duplicates.
        """
        if exp_rep.index.has_duplicates:
            raise ValueError(
                "The index of this search space contains duplicates. "
                "This is not allowed, as it can lead to hard-to-detect bugs."
            )

    @metadata.default
    def _default_metadata(self) -> pd.DataFrame:
        """Create the default metadata."""
        # If the discrete search space is empty, explicitly return an empty dataframe
        # instead of simply using a zero-length index. Otherwise, the Boolean dtype
        # would be lost during a serialization roundtrip as there would be no
        # data available that allows to determine the type, causing subsequent
        # equality checks to fail.
        # TODO: verify if this is still required
        if self.is_empty:
            return pd.DataFrame(columns=_METADATA_COLUMNS)

        # TODO [16605]: Redesign metadata handling
        # Exclude inactive tasks from search
        df = pd.DataFrame(False, columns=_METADATA_COLUMNS, index=self.exp_rep.index)
        off_task_idxs = ~self._on_task_configurations()
        df.loc[off_task_idxs.values, "dont_recommend"] = True  # type: ignore

        return df

    @metadata.validator
    def _validate_metadata(  # noqa: DOC101, DOC103
        self, _: Any, metadata: pd.DataFrame
    ) -> None:
        """Validate the metadata.

        Raises:
            ValueError: If the provided metadata allows testing parameter
                configurations for inactive tasks.
        """
        # We first check whether there are actually any parameters that need to be
        # checked.
        if self.is_empty:
            return
        off_task_idxs = ~self._on_task_configurations()
        if not metadata.loc[off_task_idxs.values, "dont_recommend"].all():  # type: ignore
            raise ValueError(
                "Inconsistent instructions given: The provided metadata allows "
                "testing parameter configurations for inactive tasks."
            )

    @comp_rep.default
    def _default_comp_rep(self) -> pd.DataFrame:
        """Create the default computational representation."""
        # Create a dataframe containing the computational parameter representation
        comp_rep = self.transform(self.exp_rep)

        # Ignore all columns that do not carry any covariate information
        # TODO[12758]: This logic needs to be refined, i.e. when should we drop columns
        #   and when not (can have undesired/unexpected side-effects). Should this be
        #   configurable at the parameter level? A hotfix was made to exclude task
        #   parameters, but this needs to be revisited as well.
        comp_rep = df_drop_single_value_columns(
            comp_rep, [p.name for p in self.parameters if isinstance(p, TaskParameter)]
        )

        return comp_rep

    def __attrs_post_init__(self) -> None:
        # TODO [16605]: Redesign metadata handling
        if self.is_empty:
            return
        off_task_idxs = ~self._on_task_configurations()
        self.metadata.loc[off_task_idxs.values, "dont_recommend"] = True  # type: ignore
    def to_searchspace(self) -> SearchSpace:
        """Turn the subspace into a search space with no continuous part."""
        from baybe.searchspace.core import SearchSpace

        return SearchSpace(discrete=self)
    def _on_task_configurations(self) -> pd.Series:
        """Retrieve the parameter configurations for the active tasks."""
        # TODO [16932]: This only works for a single parameter
        try:
            task_param = next(
                p for p in self.parameters if isinstance(p, TaskParameter)
            )
        except StopIteration:
            return pd.Series(True, index=self.exp_rep.index)
        return self.exp_rep[task_param.name].isin(task_param.active_values)
    @classmethod
    def empty(cls) -> SubspaceDiscrete:
        """Create an empty discrete subspace."""
        return SubspaceDiscrete(
            parameters=[],
            exp_rep=pd.DataFrame(),
            metadata=pd.DataFrame(columns=_METADATA_COLUMNS),
        )
    @classmethod
    def from_parameter(cls, parameter: DiscreteParameter) -> SubspaceDiscrete:
        """Create a subspace from a single parameter.

        Args:
            parameter: The parameter to span the subspace.

        Returns:
            The created subspace.
        """
        return cls.from_product([parameter])
    @classmethod
    def from_product(
        cls,
        parameters: Sequence[DiscreteParameter],
        constraints: Sequence[DiscreteConstraint] | None = None,
        empty_encoding: bool = False,
    ) -> SubspaceDiscrete:
        """See :class:`baybe.searchspace.core.SearchSpace`."""
        # Set defaults and order constraints
        constraints = constraints or []
        constraints = sorted(
            constraints,
            key=lambda x: DISCRETE_CONSTRAINTS_FILTERING_ORDER.index(x.__class__),
        )

        try:
            # Check for manual deactivation of polars
            if os.environ.get("BAYBE_DEACTIVATE_POLARS", None) is not None:
                raise OptionalImportError(
                    "Polars was deactivated manually via environment variable."
                )

            # Apply polars product and filtering
            lazy_df = parameter_cartesian_prod_polars(parameters)
            lazy_df, mask_missing = _apply_constraint_filter_polars(
                lazy_df, constraints
            )
            df_records = lazy_df.collect(streaming=True).to_dicts()
            df = pd.DataFrame.from_records(df_records)
        except OptionalImportError:
            # Apply pandas product
            df = parameter_cartesian_prod_pandas(parameters)
            mask_missing = [True] * len(constraints)

        # Gather and use constraints not yet applied
        _apply_constraint_filter_pandas(df, list(compress(constraints, mask_missing)))

        return SubspaceDiscrete(
            parameters=parameters,
            constraints=constraints,
            exp_rep=df,
            empty_encoding=empty_encoding,
        )
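    # Illustrative usage sketch (not part of the library source): constructing a
    # subspace as the Cartesian product of two parameters. The parameter names and
    # values below are made up for demonstration purposes.
    #
    #     from baybe.parameters import CategoricalParameter, NumericalDiscreteParameter
    #
    #     subspace = SubspaceDiscrete.from_product(
    #         parameters=[
    #             NumericalDiscreteParameter(name="Temperature", values=[10, 20, 30]),
    #             CategoricalParameter(name="Solvent", values=["water", "ethanol"]),
    #         ]
    #     )
    #     assert len(subspace.exp_rep) == 3 * 2  # full Cartesian product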
    @classmethod
    def from_dataframe(
        cls,
        df: pd.DataFrame,
        parameters: Sequence[DiscreteParameter] | None = None,
        empty_encoding: bool = False,
    ) -> SubspaceDiscrete:
        """Create a discrete subspace with a specified set of configurations.

        Args:
            df: The experimental representation of the search space to be created.
            parameters: Optional parameter objects corresponding to the columns in the
                given dataframe that can be provided to explicitly control parameter
                attributes. If a match between column name and parameter name is found,
                the corresponding parameter object is used. If a column has no match in
                the parameter list, a
                :class:`baybe.parameters.numerical.NumericalDiscreteParameter` is
                created if possible, or a
                :class:`baybe.parameters.categorical.CategoricalParameter` is used as
                fallback. For both types, default values are used for their optional
                arguments. For more details, see
                :func:`baybe.parameters.utils.get_parameters_from_dataframe`.
            empty_encoding: See :func:`baybe.searchspace.core.SearchSpace.from_product`.

        Returns:
            The created discrete subspace.
        """

        def discrete_parameter_factory(
            name: str, values: Collection[Any]
        ) -> DiscreteParameter:
            """Try to create a numerical parameter or use a categorical fallback."""
            try:
                return NumericalDiscreteParameter(name=name, values=values)
            except IterableValidationError:
                return CategoricalParameter(name=name, values=values)

        # Get the full list of both explicitly and implicitly defined parameters
        parameters = get_parameters_from_dataframe(
            df, discrete_parameter_factory, parameters
        )

        return cls(parameters=parameters, exp_rep=df, empty_encoding=empty_encoding)
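    # Illustrative usage sketch (not part of the library source): creating a subspace
    # from a fixed set of configurations. The column names are hypothetical; "x" would
    # be inferred as a NumericalDiscreteParameter and "solvent" as a
    # CategoricalParameter.
    #
    #     df = pd.DataFrame({"x": [1.0, 2.0], "solvent": ["water", "ethanol"]})
    #     subspace = SubspaceDiscrete.from_dataframe(df)
    #     assert {p.name for p in subspace.parameters} == {"x", "solvent"}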
    @classmethod
    def from_simplex(
        cls,
        max_sum: float,
        simplex_parameters: Sequence[NumericalDiscreteParameter],
        product_parameters: Sequence[DiscreteParameter] | None = None,
        constraints: Sequence[DiscreteConstraint] | None = None,
        min_nonzero: int = 0,
        max_nonzero: int | None = None,
        boundary_only: bool = False,
        tolerance: float = 1e-6,
    ) -> SubspaceDiscrete:
        """Efficiently create discrete simplex subspaces.

        The same result can be achieved using
        :meth:`baybe.searchspace.discrete.SubspaceDiscrete.from_product` in combination
        with appropriate constraints. However, such an approach is inefficient because
        the Cartesian product involved creates an exponentially large set of
        candidates, most of which do not satisfy the simplex constraints and must
        subsequently be filtered out by the method.

        By contrast, this method uses a shortcut that removes invalid candidates
        already during the creation of parameter combinations, resulting in a
        significantly faster construction.

        Args:
            max_sum: The maximum sum of the parameter values defining the simplex size.
            simplex_parameters: The parameters to be used for the simplex construction.
            product_parameters: Optional parameters that enter in form of a Cartesian
                product.
            constraints: See :class:`baybe.searchspace.core.SearchSpace`.
            min_nonzero: Optional restriction on the minimum number of nonzero
                parameter values in the simplex construction.
            max_nonzero: Optional restriction on the maximum number of nonzero
                parameter values in the simplex construction.
            boundary_only: Flag determining whether to keep only parameter
                configurations on the simplex boundary.
            tolerance: Numerical tolerance used to validate the simplex constraint.

        Raises:
            ValueError: If the passed simplex parameters are not suitable for a
                simplex construction.
            ValueError: If the passed product parameters are not discrete.
            ValueError: If the passed simplex parameters and product parameters are
                not disjoint.

        Returns:
            The created simplex subspace.

        Note:
            The achieved efficiency gains can vary depending on the particular order
            in which the parameters are passed to this method, as the configuration
            space is built up incrementally from the parameter sequence.
        """
        # Resolve defaults
        if product_parameters is None:
            product_parameters = []
        if constraints is None:
            constraints = []
        if max_nonzero is None:
            max_nonzero = len(simplex_parameters)

        # Validate constraints
        validate_constraints(constraints, [*simplex_parameters, *product_parameters])

        # Validate parameter types
        if not all(
            isinstance(p, NumericalDiscreteParameter) for p in simplex_parameters
        ):
            raise ValueError(
                f"All parameters passed via 'simplex_parameters' "
                f"must be of type '{NumericalDiscreteParameter.__name__}'."
            )
        if not all(p.is_discrete for p in product_parameters):
            raise ValueError(
                f"All parameters passed via 'product_parameters' "
                f"must be subclasses of '{DiscreteParameter.__name__}'."
            )

        # Validate no overlap between simplex parameters and product parameters
        simplex_parameters_names = {p.name for p in simplex_parameters}
        product_parameters_names = {p.name for p in product_parameters}
        if overlap := simplex_parameters_names.intersection(product_parameters_names):
            raise ValueError(
                f"Parameter sets passed via 'simplex_parameters' and "
                f"'product_parameters' must be disjoint but share the following "
                f"parameters: {overlap}."
            )

        # Construct the product part of the space
        product_space = parameter_cartesian_prod_pandas(product_parameters)
        if not simplex_parameters:
            return cls(parameters=product_parameters, exp_rep=product_space)

        # Validate non-negativity
        min_values = [min(p.values) for p in simplex_parameters]
        max_values = [max(p.values) for p in simplex_parameters]
        if not min(min_values) >= 0.0:
            raise ValueError(
                f"All simplex_parameters passed to '{cls.from_simplex.__name__}' "
                f"must have non-negative values only."
            )

        def drop_invalid(
            df: pd.DataFrame,
            max_sum: float,
            boundary_only: bool,
            min_nonzero: int | None = None,
            max_nonzero: int | None = None,
        ) -> None:
            """Drop rows that violate the specified simplex constraint.

            Args:
                df: The dataframe whose rows should satisfy the simplex constraint.
                max_sum: The maximum row sum defining the simplex size.
                boundary_only: Flag to control if the points represented by the rows
                    may lie inside the simplex or on its boundary only.
                min_nonzero: Minimum number of nonzero parameters required per row.
                max_nonzero: Maximum number of nonzero parameters allowed per row.
            """
            # Apply sum constraints
            row_sums = df.sum(axis=1)
            mask_violated = row_sums > max_sum + tolerance
            if boundary_only:
                mask_violated |= row_sums < max_sum - tolerance

            # Apply optional nonzero constraints
            if (min_nonzero is not None) or (max_nonzero is not None):
                n_nonzero = (df != 0.0).sum(axis=1)
                if min_nonzero is not None:
                    mask_violated |= n_nonzero < min_nonzero
                if max_nonzero is not None:
                    mask_violated |= n_nonzero > max_nonzero

            # Remove violating rows
            idxs_to_drop = df[mask_violated].index
            df.drop(index=idxs_to_drop, inplace=True)

        # Get the minimum sum contributions to come in the upcoming joins (the
        # first item is the minimum possible sum of all parameters starting from the
        # second parameter, the second item is the minimum possible sum starting from
        # the third parameter, and so on ...)
        min_sum_upcoming = np.cumsum(min_values[:0:-1])[::-1]

        # Get the min/max number of nonzero values to come in the upcoming joins (the
        # first item is the min/max number of nonzero parameters starting from the
        # second parameter, the second item is the min/max number starting from
        # the third parameter, and so on ...)
        min_nonzero_upcoming = np.cumsum((np.asarray(min_values) > 0.0)[:0:-1])[::-1]
        max_nonzero_upcoming = np.cumsum((np.asarray(max_values) > 0.0)[:0:-1])[::-1]

        # Incrementally build up the space, dropping invalid configurations along the
        # way. More specifically:
        # * After having cross-joined a new parameter, there must be enough "room"
        #   left for the remaining parameters to fit. That is, configurations of the
        #   current parameter subset that exceed the desired total value minus the
        #   minimum contribution to come from the yet-to-be-added parameters can
        #   already be discarded, because it is already clear that the total sum will
        #   be exceeded once all joins are completed.
        # * Analogously, there must be enough "nonzero slots" left for the
        #   yet-to-be-joined parameters, i.e. parameter subset configurations can be
        #   discarded where the number of nonzero parameters already exceeds the
        #   maximum number of nonzeros minus the number of nonzeros to come, because
        #   it is already clear that the maximum will be exceeded once all joins are
        #   completed.
        # * Similarly, it can be verified for each parameter that there are still
        #   enough nonzero parameters to come to even reach the minimum desired
        #   number of nonzeros after all joins.
        for i, (
            param,
            min_sum_to_go,
            min_nonzero_to_go,
            max_nonzero_to_go,
        ) in enumerate(
            zip(
                simplex_parameters,
                np.append(min_sum_upcoming, 0),
                np.append(min_nonzero_upcoming, 0),
                np.append(max_nonzero_upcoming, 0),
            )
        ):
            if i == 0:
                exp_rep = pd.DataFrame({param.name: param.values})
            else:
                exp_rep = pd.merge(
                    exp_rep, pd.DataFrame({param.name: param.values}), how="cross"
                )
            drop_invalid(
                exp_rep,
                max_sum=max_sum - min_sum_to_go,
                # the maximum possible number of nonzeros to come dictates if we
                # can achieve our minimum constraint in the end:
                min_nonzero=min_nonzero - max_nonzero_to_go,
                # the minimum possible number of nonzeros to come dictates if we
                # can stay below the targeted maximum in the end:
                max_nonzero=max_nonzero - min_nonzero_to_go,
                boundary_only=False,
            )

        # If requested, keep only the boundary values
        if boundary_only:
            drop_invalid(exp_rep, max_sum, boundary_only=True)

        # Augment the Cartesian product created from all other parameter types
        if product_parameters:
            exp_rep = pd.merge(exp_rep, product_space, how="cross")

        # Remove entries that violate parameter constraints
        _apply_constraint_filter_pandas(exp_rep, constraints)

        return cls(
            parameters=[*simplex_parameters, *product_parameters],
            exp_rep=exp_rep,
            constraints=constraints,
        )
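    # Illustrative usage sketch (not part of the library source): three hypothetical
    # mixture fractions whose sum may not exceed 1.0. Only configurations satisfying
    # the simplex constraint survive the incremental construction.
    #
    #     fractions = [
    #         NumericalDiscreteParameter(name=f"c{i}", values=[0.0, 0.25, 0.5, 0.75, 1.0])
    #         for i in range(3)
    #     ]
    #     subspace = SubspaceDiscrete.from_simplex(
    #         max_sum=1.0, simplex_parameters=fractions
    #     )
    #     assert (subspace.exp_rep.sum(axis=1) <= 1.0 + 1e-6).all()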
    @property
    def is_empty(self) -> bool:
        """Return whether this subspace is empty."""
        return len(self.parameters) == 0

    @property
    def param_bounds_comp(self) -> np.ndarray:
        """Return bounds as tensor.

        Takes bounds from the parameter definitions but discards bounds belonging to
        columns that were filtered out during the creation of the space.
        """
        if not self.parameters:
            return np.empty((2, 0))
        bounds = np.hstack(
            [
                np.vstack([p.comp_df[col].min(), p.comp_df[col].max()])
                for p in self.parameters
                for col in p.comp_df
                if col in self.comp_rep.columns
            ]
        )
        return bounds
    @staticmethod
    def estimate_product_space_size(
        parameters: Sequence[DiscreteParameter],
    ) -> MemorySize:
        """Estimate an upper bound for the memory size of a product space.

        Args:
            parameters: The parameters spanning the product space.

        Returns:
            The estimated memory size.
        """
        # Compute the dataframe shapes
        n_cols_exp = len(parameters)
        n_cols_comp = sum(p.comp_df.shape[1] for p in parameters)
        n_rows = prod(p.comp_df.shape[0] for p in parameters)

        # Comp rep space is estimated as the size of float times the number of matrix
        # elements in the comp rep. The latter is the total number of parameter
        # configurations (= number of rows) times the total number of columns.
        comp_rep_bytes = (
            np.array([0.0], dtype=DTypeFloatNumpy).itemsize * n_rows * n_cols_comp
        )

        # Exp rep space is estimated as the size of the per-parameter exp rep
        # dataframe times the number of times it will appear in the entire search
        # space. The latter is the total number of parameter configurations
        # (= number of rows) divided by the number of values for the respective
        # parameter. Contributions of all parameters are summed up.
        exp_rep_bytes = sum(
            pd.DataFrame(p.values).memory_usage(index=False, deep=True).sum()
            * n_rows
            / p.comp_df.shape[0]
            for p in parameters
        )

        return MemorySize(
            exp_rep_bytes=exp_rep_bytes,
            exp_rep_shape=(n_rows, n_cols_exp),
            comp_rep_bytes=comp_rep_bytes,
            comp_rep_shape=(n_rows, n_cols_comp),
        )
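    # Illustrative usage sketch (not part of the library source): checking the memory
    # footprint of a product space before actually building it. `my_parameters` is a
    # placeholder for any sequence of discrete parameters.
    #
    #     size = SubspaceDiscrete.estimate_product_space_size(my_parameters)
    #     print(size.exp_rep_shape, size.exp_rep_human_readable)
    #     print(size.comp_rep_shape, size.comp_rep_human_readable)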
    def mark_as_measured(
        self,
        measurements: pd.DataFrame,
        numerical_measurements_must_be_within_tolerance: bool,
    ) -> None:
        """Mark the given elements of the space as measured.

        Args:
            measurements: A dataframe containing parameter settings that should be
                marked as measured.
            numerical_measurements_must_be_within_tolerance: See
                :func:`baybe.utils.dataframe.fuzzy_row_match`.
        """
        idxs_matched = fuzzy_row_match(
            self.exp_rep,
            measurements,
            self.parameters,
            numerical_measurements_must_be_within_tolerance,
        )
        self.metadata.loc[idxs_matched, "was_measured"] = True
    def get_candidates(
        self,
        allow_repeated_recommendations: bool = False,
        allow_recommending_already_measured: bool = False,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Return the set of candidate parameter settings that can be tested.

        Args:
            allow_repeated_recommendations: If ``True``, parameter settings that have
                already been recommended in an earlier iteration are still considered
                valid candidates. This is relevant, for instance, when an earlier
                recommended parameter setting has not been measured by the user (for
                any reason) after the corresponding recommendation was made.
            allow_recommending_already_measured: If ``True``, parameter settings for
                which there are already target values available are still considered
                valid candidates.

        Returns:
            The candidate parameter settings both in experimental and computational
            representation.
        """
        # Filter the search space down to the candidates
        mask_todrop = self.metadata["dont_recommend"].copy()
        if not allow_repeated_recommendations:
            mask_todrop |= self.metadata["was_recommended"]
        if not allow_recommending_already_measured:
            mask_todrop |= self.metadata["was_measured"]

        return self.exp_rep.loc[~mask_todrop], self.comp_rep.loc[~mask_todrop]
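    # Illustrative usage sketch (not part of the library source): with the default
    # flags, rows marked "was_recommended", "was_measured", or "dont_recommend" in
    # the metadata are excluded from both returned candidate dataframes.
    #
    #     candidates_exp, candidates_comp = subspace.get_candidates()
    #     assert len(candidates_exp) == len(candidates_comp)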
    def transform(
        self,
        df: pd.DataFrame | None = None,
        /,
        *,
        allow_missing: bool = False,
        allow_extra: bool | None = None,
        data: pd.DataFrame | None = None,
    ) -> pd.DataFrame:
        """See :func:`baybe.searchspace.core.SearchSpace.transform`."""
        # >>>>>>>>>> Deprecation
        if not ((df is None) ^ (data is None)):
            raise ValueError(
                "Provide the dataframe to be transformed as argument to `df`."
            )

        if data is not None:
            df = data
            warnings.warn(
                "Providing the dataframe via the `data` argument is deprecated and "
                "will be removed in a future version. Please pass your dataframe "
                "as positional argument instead.",
                DeprecationWarning,
            )

        # Mypy does not infer from the above that `df` must be a dataframe here
        assert isinstance(df, pd.DataFrame)

        if allow_extra is None:
            allow_extra = True
            if set(df.columns) - {p.name for p in self.parameters}:
                warnings.warn(
                    "For backward compatibility, the new `allow_extra` flag is set "
                    "to `True` when left unspecified. However, this behavior will be "
                    "changed in a future version. If you want to invoke the old "
                    "behavior, please explicitly set `allow_extra=True`.",
                    DeprecationWarning,
                )
        # <<<<<<<<<< Deprecation

        # Extract the parameters to be transformed
        parameters = get_transform_parameters(
            self.parameters, df, allow_missing, allow_extra
        )

        # If the transformed values are not required, return an empty dataframe
        if self.empty_encoding or len(df) < 1:
            return pd.DataFrame(index=df.index)

        # Transform the parameters
        dfs = []
        for param in parameters:
            comp_df = param.transform(df[param.name])
            dfs.append(comp_df)
        comp_rep = pd.concat(dfs, axis=1) if dfs else pd.DataFrame()

        # If the computational representation has already been built (with potentially
        # removing some columns, e.g. due to decorrelation or dropping constant ones),
        # any subsequent transformation should yield the same columns.
        try:
            return comp_rep[self.comp_rep.columns]
        except AttributeError:
            return comp_rep
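    # Illustrative usage sketch (not part of the library source): transforming the
    # experimental representation of an already constructed subspace reproduces the
    # columns of its computational representation.
    #
    #     comp = subspace.transform(subspace.exp_rep)
    #     assert list(comp.columns) == list(subspace.comp_rep.columns)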
    def get_parameters_by_name(
        self, names: Sequence[str]
    ) -> tuple[DiscreteParameter, ...]:
        """Return parameters with the specified names.

        Args:
            names: Sequence of parameter names.

        Returns:
            The named parameters.
        """
        return tuple(p for p in self.parameters if p.name in names)
def _apply_constraint_filter_pandas(
    df: pd.DataFrame, constraints: Collection[DiscreteConstraint]
) -> pd.DataFrame:
    """Remove discrete search space entries based on constraints.

    The filtering is done inplace, but the modified object is still returned.

    Args:
        df: The data in experimental representation to be modified inplace.
        constraints: Collection of discrete constraints.

    Returns:
        The filtered dataframe.
    """
    # Remove entries that violate parameter constraints:
    for constraint in (c for c in constraints if c.eval_during_creation):
        idxs = constraint.get_invalid(df)
        df.drop(index=idxs, inplace=True)
    df.reset_index(inplace=True, drop=True)

    return df


def _apply_constraint_filter_polars(
    ldf: pl.LazyFrame,
    constraints: Sequence[DiscreteConstraint],
) -> tuple[pl.LazyFrame, list[bool]]:
    """Remove discrete search space entries based on constraints.

    Note:
        This will silently skip constraints that have no Polars implementation.

    Args:
        ldf: The data in experimental representation to be filtered.
        constraints: Collection of discrete constraints.

    Returns:
        A tuple containing

        * The Polars lazyframe with undesired rows removed
        * A Boolean mask indicating which constraints have **not** been applied
    """
    mask_missing = []
    for c in constraints:
        try:
            to_keep = c.get_invalid_polars().not_()
            ldf = ldf.filter(to_keep)
            mask_missing.append(False)
        except NotImplementedError:
            mask_missing.append(True)

    return ldf, mask_missing
def parameter_cartesian_prod_polars(parameters: Sequence[Parameter]) -> pl.LazyFrame:
    """Create the Cartesian product of all parameter values using Polars.

    Ignores continuous parameters.

    Args:
        parameters: List of parameter objects.

    Returns:
        A lazy dataframe containing all possible discrete parameter value combinations.
    """
    from baybe._optional.polars import polars as pl

    discrete_parameters = [p for p in parameters if p.is_discrete]
    if not discrete_parameters:
        return pl.LazyFrame()

    # Convert each parameter to a lazy dataframe for cross-join operation
    param_frames = [
        pl.LazyFrame({p.name: p.values})  # type:ignore[attr-defined]
        for p in discrete_parameters
    ]

    # Handling edge cases
    if len(param_frames) == 1:
        return param_frames[0]

    # Cross-join parameters
    res = param_frames[0]
    for frame in param_frames[1:]:
        res = res.join(frame, how="cross", force_parallel=True)

    return res
def parameter_cartesian_prod_pandas(
    parameters: Sequence[Parameter],
) -> pd.DataFrame:
    """Create the Cartesian product of all parameter values using Pandas.

    Ignores continuous parameters.

    Args:
        parameters: List of parameter objects.

    Returns:
        A dataframe containing all possible discrete parameter value combinations.
    """
    discrete_parameters = [p for p in parameters if p.is_discrete]
    if not discrete_parameters:
        return pd.DataFrame()

    index = pd.MultiIndex.from_product(
        [p.values for p in discrete_parameters],  # type:ignore[attr-defined]
        names=[p.name for p in discrete_parameters],
    )
    ret = pd.DataFrame(index=index).reset_index()

    return ret
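# Illustrative usage sketch (not part of the library source): the number of rows of
# the pandas Cartesian product equals the product of the value counts of the discrete
# parameters. `my_parameters` is a placeholder name.
#
#     df = parameter_cartesian_prod_pandas(my_parameters)
#     assert len(df) == prod(len(p.values) for p in my_parameters if p.is_discrete)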
def validate_simplex_subspace_from_config(specs: dict, _) -> None:
    """Validate the discrete space while skipping costly creation steps."""
    # Validate product inputs without constructing it
    if specs.get("constructor", None) == "from_product":
        parameters = converter.structure(specs["parameters"], list[DiscreteParameter])
        validate_parameters(parameters)

        constraints = specs.get("constraints", None)
        if constraints:
            constraints = converter.structure(
                specs["constraints"], list[DiscreteConstraint]
            )
            validate_constraints(constraints, parameters)

    # Validate simplex inputs without constructing it
    elif specs.get("constructor", None) == "from_simplex":
        simplex_parameters = converter.structure(
            specs["simplex_parameters"], list[NumericalDiscreteParameter]
        )

        if not all(min(p.values) >= 0.0 for p in simplex_parameters):
            raise ValueError(
                f"All simplex_parameters passed to "
                f"'{SubspaceDiscrete.from_simplex.__name__}' must have non-negative "
                f"values only."
            )

        product_parameters = specs.get("product_parameters", None)
        if product_parameters:
            product_parameters = converter.structure(
                specs["product_parameters"], list[DiscreteParameter]
            )
            validate_parameters(simplex_parameters + product_parameters)

        constraints = specs.get("constraints", None)
        if constraints:
            constraints = converter.structure(
                specs["constraints"], list[DiscreteConstraint]
            )
            validate_constraints(constraints, simplex_parameters + product_parameters)

    # For all other types, validate by construction
    else:
        converter.structure(specs, SubspaceDiscrete)
# Register deserialization hook
converter.register_structure_hook(SubspaceDiscrete, select_constructor_hook)