# Source code for baybe.recommenders.pure.nonpredictive.clustering

"""Recommenders based on clustering."""

from abc import ABC, abstractmethod
from typing import ClassVar

import numpy as np
import pandas as pd
from attrs import define, field
from scipy.stats import multivariate_normal
from sklearn.base import ClusterMixin
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler

from baybe.recommenders.pure.nonpredictive.base import NonPredictiveRecommender
from baybe.searchspace import SearchSpaceType, SubspaceDiscrete


@define
class SKLearnClusteringRecommender(NonPredictiveRecommender, ABC):
    """Intermediate class for cluster-based selection of discrete candidates.

    Suitable for ``sklearn``-like models that have a ``fit`` and ``predict``
    method. Specific model parameters and cluster sub-selection techniques can be
    declared in the derived classes.
    """

    # Class variables
    compatibility: ClassVar[SearchSpaceType] = SearchSpaceType.DISCRETE
    # See base class.

    # TODO: `_use_custom_selector` can probably be replaced with a fallback mechanism
    #   that checks if a custom mechanism is implemented and uses the default
    #   otherwise (similar to what is done in the recommenders)

    model_cluster_num_parameter_name: ClassVar[str]
    """Class variable describing the name of the clustering parameter."""

    _use_custom_selector: ClassVar[bool] = False
    """Class variable flagging whether a custom selector is being used."""

    # Object variables
    model_params: dict = field(factory=dict)
    """Optional model parameters that will be passed to the surrogate constructor.
    This is initialized with reasonable default values for the derived child
    classes."""

    @staticmethod
    @abstractmethod
    def _get_model_cls() -> type[ClusterMixin]:
        """Return the surrogate model class."""

    def _make_selection_default(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Select one candidate from each cluster uniformly at random.

        This function is model-agnostic and can be used by any child class.

        Args:
            model: The used model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates.
        """
        assigned_clusters = model.predict(candidates_scaled)
        selection = [
            np.random.choice(np.argwhere(cluster == assigned_clusters).flatten())
            for cluster in np.unique(assigned_clusters)
        ]
        return selection

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Select candidates from the computed clustering.

        This function is model-specific and may be implemented by the derived class.

        Args:
            model: The used model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates.

        Raises:
            NotImplementedError: If this function is not implemented. Should be
                unreachable.
        """
        raise NotImplementedError("This line in the code should be unreachable.")

    def _recommend_discrete(
        self,
        subspace_discrete: SubspaceDiscrete,
        candidates_comp: pd.DataFrame,
        batch_size: int,
    ) -> pd.Index:
        # See base class.

        # Fit scaler on entire search space
        # TODO [Scaling]: scaling should be handled by the search space object
        scaler = StandardScaler()
        scaler.fit(subspace_discrete.comp_rep)

        candidates_scaled = np.ascontiguousarray(scaler.transform(candidates_comp))

        # Set model parameters and perform fit
        model = self._get_model_cls()(
            **{self.model_cluster_num_parameter_name: batch_size},
            **self.model_params,
        )
        model.fit(candidates_scaled)

        # Perform selection based on assigned clusters
        if self._use_custom_selector:
            selection = self._make_selection_custom(model, candidates_scaled)
        else:
            selection = self._make_selection_default(model, candidates_scaled)

        # Convert positional indices into DataFrame indices and return result
        return candidates_comp.index[selection]
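
# --- Illustrative sketch (not part of the baybe source): how a new subclass
# plugs into ``SKLearnClusteringRecommender``. Any ``sklearn``-style clusterer
# with ``fit``/``predict`` and a cluster-count parameter qualifies; the
# ``BirchClusteringRecommender`` name below is hypothetical. Since
# ``_use_custom_selector`` stays ``False``, the inherited
# ``_make_selection_default`` picks one random point per cluster.


@define
class BirchClusteringRecommender(SKLearnClusteringRecommender):
    """Hypothetical BIRCH clustering recommender (illustration only)."""

    model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
    # See base class.

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        from sklearn.cluster import Birch

        return Birch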

@define
class PAMClusteringRecommender(SKLearnClusteringRecommender):
    """Partitioning Around Medoids (PAM) clustering recommender."""

    model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
    # See base class.

    _use_custom_selector: ClassVar[bool] = True
    # See base class.

    # Object variables
    model_params: dict = field()
    # See base class.

    @model_params.default
    def _default_model_params(self) -> dict:
        """Create the default model parameters."""
        return {"max_iter": 100, "init": "k-medoids++"}

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        from sklearn_extra.cluster import KMedoids

        return KMedoids

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Select candidates from the computed clustering.

        In PAM, cluster centers (medoids) correspond to actual data points, which
        means they can be directly used for the selection.

        Args:
            model: The used model.
            candidates_scaled: The already scaled candidates. Unused.

        Returns:
            A list with positional indices of the selected candidates.
        """
        selection = model.medoid_indices_.tolist()
        return selection
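
# --- Illustrative sketch (not part of the baybe source): why PAM permits the
# direct selection above. Unlike K-means centroids, K-medoids centers are
# actual data points, so ``medoid_indices_`` are valid positional indices into
# the candidate array. Requires the optional ``scikit-learn-extra`` package;
# the random data below merely stands in for scaled candidates.

import numpy as np
from sklearn_extra.cluster import KMedoids

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))  # stand-in for scaled candidates

model = KMedoids(n_clusters=5, init="k-medoids++", max_iter=100).fit(X)
selection = model.medoid_indices_.tolist()

# Each medoid coincides with a row of X, i.e., with an actual candidate.
assert all(np.allclose(X[i], c) for i, c in zip(selection, model.cluster_centers_))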

@define
class KMeansClusteringRecommender(SKLearnClusteringRecommender):
    """K-means clustering recommender."""

    # Class variables
    model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
    # See base class.

    _use_custom_selector: ClassVar[bool] = True
    # See base class.

    # Object variables
    model_params: dict = field()
    # See base class.

    @model_params.default
    def _default_model_params(self) -> dict:
        """Create the default model parameters."""
        return {"max_iter": 1000, "n_init": 50}

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        from sklearn.cluster import KMeans

        return KMeans

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Select candidates from the computed clustering.

        For K-means, a reasonable choice is to pick the points closest to each
        cluster center.

        Args:
            model: The used model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates.
        """
        distances = pairwise_distances(candidates_scaled, model.cluster_centers_)

        # Set the distances of points that were not assigned to a cluster by the
        # model to infinity. This ensures that exactly one unique point is
        # selected per cluster.
        predicted_clusters = model.predict(candidates_scaled)
        for k_cluster in range(model.cluster_centers_.shape[0]):
            idxs = predicted_clusters != k_cluster
            distances[idxs, k_cluster] = np.inf

        selection = np.argmin(distances, axis=0).tolist()
        return selection
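
# --- Illustrative sketch (not part of the baybe source): the
# closest-to-centroid selection above, demonstrated with plain ``sklearn``.
# Masking rows assigned to other clusters with ``inf`` before the column-wise
# ``argmin`` restricts each column to its own cluster's members, so the chosen
# indices are pairwise distinct.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))  # stand-in for scaled candidates

model = KMeans(n_clusters=5, n_init=10).fit(X)
distances = pairwise_distances(X, model.cluster_centers_)
labels = model.predict(X)
for k in range(model.cluster_centers_.shape[0]):
    distances[labels != k, k] = np.inf  # exclude members of other clusters

selection = np.argmin(distances, axis=0).tolist()
assert len(set(selection)) == 5  # one unique candidate per cluster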

@define
class GaussianMixtureClusteringRecommender(SKLearnClusteringRecommender):
    """Gaussian mixture model (GMM) clustering recommender."""

    # Class variables
    model_cluster_num_parameter_name: ClassVar[str] = "n_components"
    # See base class.

    _use_custom_selector: ClassVar[bool] = True
    # See base class. Required so that the custom selector below is actually used.

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        from sklearn.mixture import GaussianMixture

        return GaussianMixture

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Select candidates from the computed clustering.

        In a GMM, a reasonable choice is to pick, for each cluster, the point
        with the highest probability density.

        Args:
            model: The used model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates.
        """
        predicted_clusters = model.predict(candidates_scaled)
        selection = []
        for k_cluster in range(model.n_components):
            density = multivariate_normal(
                cov=model.covariances_[k_cluster],
                mean=model.means_[k_cluster],
            ).logpdf(candidates_scaled)

            # Only points assigned to the current cluster by the model are
            # eligible, hence mask the log-density of all others with -inf.
            # (Masking with 0.0 would be incorrect: log-densities are typically
            # negative, so masked points could win the argmax.)
            density[predicted_clusters != k_cluster] = -np.inf

            selection.append(np.argmax(density).item())
        return selection
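
# --- Illustrative sketch (not part of the baybe source): the per-component
# highest-density selection above, using plain ``sklearn``/``scipy``. Points
# assigned to other components are masked with ``-inf`` in log-density space
# so they can never win the ``argmax``.

import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))  # stand-in for scaled candidates

model = GaussianMixture(n_components=5).fit(X)
labels = model.predict(X)

selection = []
for k in range(model.n_components):
    log_density = multivariate_normal(
        mean=model.means_[k], cov=model.covariances_[k]
    ).logpdf(X)
    log_density[labels != k] = -np.inf  # restrict to this component's members
    selection.append(np.argmax(log_density).item())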