Source code for baybe.recommenders.pure.nonpredictive.clustering
"""Recommenders based on clustering."""fromabcimportABC,abstractmethodfromtypingimportClassVarimportnumpyasnpimportpandasaspdfromattrsimportdefine,fieldfromscipy.statsimportmultivariate_normalfromsklearn.baseimportClusterMixinfromsklearn.metricsimportpairwise_distancesfromsklearn.preprocessingimportStandardScalerfrombaybe.recommenders.pure.nonpredictive.baseimportNonPredictiveRecommenderfrombaybe.searchspaceimportSearchSpaceType,SubspaceDiscrete
@define
class SKLearnClusteringRecommender(NonPredictiveRecommender, ABC):
    """Abstract base for recommenders that pick discrete candidates via clustering.

    The candidates are clustered with an ``sklearn``-like model (anything exposing
    ``fit`` and ``predict``), using one cluster per requested recommendation, and
    one candidate is then chosen from each cluster. Derived classes declare the
    concrete model, its parameters and, optionally, a model-specific way of
    choosing the representative of each cluster.
    """

    # Class variables
    compatibility: ClassVar[SearchSpaceType] = SearchSpaceType.DISCRETE
    # See base class.

    # TODO: `use_custom_selector` can probably be replaced with a fallback mechanism
    #   that checks if a custom mechanism is implemented and uses default otherwise
    #   (similar to what is done in the recommenders)

    model_cluster_num_parameter_name: ClassVar[str]
    """Class variable holding the name of the model constructor keyword that
    receives the number of clusters."""

    _use_custom_selector: ClassVar[bool] = False
    """Class variable indicating whether the custom selector is used instead of
    the default one."""

    # Object variables
    model_params: dict = field(factory=dict)
    """Optional keyword arguments forwarded to the model constructor. Derived
    classes may provide their own defaults."""

    @staticmethod
    @abstractmethod
    def _get_model_cls() -> type[ClusterMixin]:
        """Return the class of the clustering model to instantiate."""

    def _make_selection_default(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Draw one candidate uniformly at random from every cluster.

        The procedure only relies on ``model.predict`` and therefore works for
        any derived class.

        Args:
            model: The fitted clustering model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates, one per
            cluster that received at least one candidate.
        """
        cluster_labels = model.predict(candidates_scaled)

        picks = []
        for label in np.unique(cluster_labels):
            # Positions of all candidates that belong to the current cluster
            members = np.flatnonzero(cluster_labels == label)
            picks.append(np.random.choice(members))
        return picks

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Choose candidates using a model-specific strategy.

        Derived classes that set ``_use_custom_selector`` are expected to
        override this method.

        Args:
            model: The fitted clustering model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates.

        Raises:
            NotImplementedError: Always, unless overridden by a derived class.
                Should be unreachable.
        """
        raise NotImplementedError("This line in the code should be unreachable. Sry.")

    def _recommend_discrete(
        self,
        subspace_discrete: SubspaceDiscrete,
        candidates_comp: pd.DataFrame,
        batch_size: int,
    ) -> pd.Index:
        # See base class.

        # The scaler is fitted on the entire search space, not just the candidates
        # TODO [Scaling]: scaling should be handled by search space object
        scaler = StandardScaler().fit(subspace_discrete.comp_rep)
        candidates_scaled = np.ascontiguousarray(scaler.transform(candidates_comp))

        # Build the model with one cluster per requested recommendation and fit it
        model_cls = self._get_model_cls()
        cluster_count_kwarg = {self.model_cluster_num_parameter_name: batch_size}
        model = model_cls(**cluster_count_kwarg, **self.model_params)
        model.fit(candidates_scaled)

        # Pick the selection strategy and apply it to the fitted clustering
        selector = (
            self._make_selection_custom
            if self._use_custom_selector
            else self._make_selection_default
        )
        selection = selector(model, candidates_scaled)

        # Translate positional indices into DataFrame index labels
        return candidates_comp.index[selection]
@define
class PAMClusteringRecommender(SKLearnClusteringRecommender):
    """Clustering recommender based on Partitioning Around Medoids (PAM)."""

    # Class variables
    model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
    # See base class.

    _use_custom_selector: ClassVar[bool] = True
    # See base class.

    # Object variables
    model_params: dict = field()
    # See base class.

    @model_params.default
    def _default_model_params(self) -> dict:
        """Return the model parameters used when none are provided."""
        defaults = {"max_iter": 100, "init": "k-medoids++"}
        return defaults

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        # Imported lazily so the dependency is only needed when this class is used
        from sklearn_extra.cluster import KMedoids

        return KMedoids

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Return the medoids of the fitted clustering as the selection.

        The medoids found by PAM are actual data points, so their indices can
        be used as the selection without any further computation.

        Args:
            model: The fitted clustering model.
            candidates_scaled: The already scaled candidates. Unused.

        Returns:
            A list with positional indices of the selected candidates.
        """
        return model.medoid_indices_.tolist()
@define
class KMeansClusteringRecommender(SKLearnClusteringRecommender):
    """Clustering recommender based on K-means."""

    # Class variables
    model_cluster_num_parameter_name: ClassVar[str] = "n_clusters"
    # See base class.

    _use_custom_selector: ClassVar[bool] = True
    # See base class.

    # Object variables
    model_params: dict = field()
    # See base class.

    @model_params.default
    def _default_model_params(self) -> dict:
        """Return the model parameters used when none are provided."""
        defaults = {"max_iter": 1000, "n_init": 50}
        return defaults

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        from sklearn.cluster import KMeans

        return KMeans

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Pick, for every cluster, the member closest to the cluster center.

        Args:
            model: The fitted clustering model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates.
        """
        centers = model.cluster_centers_
        center_distances = pairwise_distances(candidates_scaled, centers)

        # Only points that the model assigned to a cluster may represent it. All
        # other point/cluster combinations get an infinite distance, so that the
        # column-wise minimum is always taken over the members of the cluster.
        labels = model.predict(candidates_scaled)
        cluster_ids = np.arange(centers.shape[0])
        is_member = labels[:, None] == cluster_ids[None, :]
        member_distances = np.where(is_member, center_distances, np.inf)

        return np.argmin(member_distances, axis=0).tolist()
@define
class GaussianMixtureClusteringRecommender(SKLearnClusteringRecommender):
    """Gaussian mixture model (GMM) clustering recommender."""

    # Class variables
    model_cluster_num_parameter_name: ClassVar[str] = "n_components"
    # See base class.

    # NOTE(review): `_use_custom_selector` is not overridden here, so the base class
    #   default applies and `_make_selection_custom` below is not dispatched to by
    #   `_recommend_discrete`. Confirm whether this is intentional before enabling it.

    @staticmethod
    def _get_model_cls() -> type[ClusterMixin]:
        # See base class.
        from sklearn.mixture import GaussianMixture

        return GaussianMixture

    def _make_selection_custom(
        self,
        model: ClusterMixin,
        candidates_scaled: pd.DataFrame | np.ndarray,
    ) -> list[int]:
        """Select candidates from the computed clustering.

        In a GMM, a reasonable choice is to pick, for each mixture component, the
        point with the highest (log) probability density among the points that
        the model assigned to that component.

        Args:
            model: The fitted mixture model.
            candidates_scaled: The already scaled candidates.

        Returns:
            A list with positional indices of the selected candidates, one entry
            per mixture component.
        """
        predicted_clusters = model.predict(candidates_scaled)

        selection = []
        for k_cluster in range(model.n_components):
            # NOTE(review): indexing `covariances_` by component presumably requires
            #   a covariance type that stores one matrix per component -- confirm
            #   before passing a different `covariance_type` via `model_params`.
            log_density = multivariate_normal(
                cov=model.covariances_[k_cluster],
                mean=model.means_[k_cluster],
            ).logpdf(candidates_scaled)

            # Only points assigned to the current component may represent it.
            # Since we operate on *log* densities, which are frequently negative,
            # the excluded points must be masked with -inf: masking with 0 would
            # let an excluded point win the argmax whenever all member log
            # densities are below zero.
            member_log_density = np.where(
                predicted_clusters == k_cluster, log_density, -np.inf
            )
            selection.append(np.argmax(member_log_density).item())

        return selection