Source code for baybe.parameters.substance

"""Substance parameters."""

from functools import cached_property
from typing import Any, ClassVar

import pandas as pd
from attrs import define, field
from attrs.validators import and_, deep_mapping, instance_of, min_len

from baybe.parameters.base import DiscreteParameter
from baybe.parameters.enum import SubstanceEncoding
from baybe.parameters.validation import validate_decorrelation
from baybe.utils.basic import group_duplicate_values
from baybe.utils.dataframe import df_drop_single_value_columns, df_uncorrelated_features

# ``ExceptionGroup`` is a builtin starting with Python 3.11. On older
# interpreters the bare name lookup below raises ``NameError``, in which case
# the ``exceptiongroup`` backport package supplies the class instead.
try:
    ExceptionGroup
except NameError:
    from exceptiongroup import ExceptionGroup


Smiles = str
"""Type alias for SMILES strings."""


@define(frozen=True, slots=False)
class SubstanceParameter(DiscreteParameter):
    """Generic substances that are treated with cheminformatics descriptors.

    Only a decorrelated subset of descriptors should be used as otherwise this can
    result in a large number of features. For a handful of molecules, keeping only
    descriptors that have a maximum correlation of 0.7 reduces the number of
    descriptors to about 5-20. The number might be substantially higher with more
    labels given.
    """

    # class variables
    is_numerical: ClassVar[bool] = False
    # See base class.

    # object variables
    data: dict[str, Smiles] = field(
        validator=deep_mapping(
            mapping_validator=min_len(2),
            # FIXME[typing]: https://github.com/python-attrs/attrs/issues/1206
            key_validator=and_(instance_of(str), min_len(1)),
            value_validator=lambda *x: None,
        )
    )
    """A mapping that provides the SMILES strings for all available parameter values."""

    decorrelate: bool | float = field(default=True, validator=validate_decorrelation)
    """Specifies the used decorrelation mode for the parameter encoding.

    - ``False``: The encoding is used as is.
    - ``True``: The encoding is decorrelated using a default correlation threshold.
    - float in (0, 1): The encoding is decorrelated using the specified threshold.
    """

    encoding: SubstanceEncoding = field(
        default=SubstanceEncoding.MORDRED, converter=SubstanceEncoding
    )
    # See base class.

    @data.validator
    def _validate_substance_data(  # noqa: DOC101, DOC103
        self, _: Any, data: dict[str, Smiles]
    ) -> None:
        """Check that every SMILES is parsable and that no substance occurs twice.

        All problems of one kind are collected and reported together instead of
        stopping at the first offending entry.

        Raises:
            ValueError: If one or more of the SMILES are invalid (raised as part
                of an ``ExceptionGroup``).
            ValueError: If several entries represent the same substance (raised
                as part of an ``ExceptionGroup``).
        """
        from baybe.utils import chemistry

        # First pass: canonicalize each SMILES and remember every failure
        canonical_by_label = {}
        invalid = []
        for label, raw_smiles in data.items():
            try:
                canonical = chemistry.get_canonical_smiles(raw_smiles)
            except ValueError:
                invalid.append(
                    ValueError(
                        f"The SMILES '{raw_smiles}' for molecule '{label}' does "
                        f"not appear to be valid."
                    )
                )
            else:
                canonical_by_label[label] = canonical
        if invalid:
            raise ExceptionGroup("invalid SMILES", invalid)

        # Second pass: labels sharing one canonical SMILES denote the same substance
        duplicates = group_duplicate_values(canonical_by_label)
        if not duplicates:
            return

        collisions = []
        for canonical, labels in duplicates.items():
            # Report the SMILES exactly as the user provided them
            entries = {label: data[label] for label in labels}
            collisions.append(
                ValueError(
                    f"The following entries all represent the same substance "
                    f"'{canonical}': {entries}."
                )
            )
        raise ExceptionGroup("duplicate substances", collisions)

    @property
    def values(self) -> tuple:
        """Returns the labels of the given set of molecules."""
        labels = self.data.keys()
        return tuple(labels)

    @cached_property
    def comp_df(self) -> pd.DataFrame:  # noqa: D102
        # See base class.
        from baybe.utils import chemistry

        smiles = list(self.data.values())
        column_prefix = self.name + "_"

        # Pick the featurizer matching the configured encoding
        if self.encoding is SubstanceEncoding.MORDRED:
            featurize = chemistry.smiles_to_mordred_features
        elif self.encoding is SubstanceEncoding.RDKIT:
            featurize = chemistry.smiles_to_rdkit_features
        elif self.encoding is SubstanceEncoding.MORGAN_FP:
            featurize = chemistry.smiles_to_fp_features
        else:
            raise ValueError(
                f"Unknown parameter encoding {self.encoding} for parameter {self.name}."
            )

        # Compute the raw descriptors
        descriptors = featurize(smiles, prefix=column_prefix)

        # Discard columns containing NaN as well as constant columns
        has_nan = descriptors.isna().any(axis=0)
        descriptors = descriptors.loc[:, ~has_nan]
        descriptors = df_drop_single_value_columns(descriptors)

        # Boolean descriptors (possible for Mordred) are turned into integers
        bool_columns = descriptors.select_dtypes(bool).columns
        descriptors[bool_columns] = descriptors[bool_columns].astype(int)

        # Use the molecule labels as row index
        descriptors.index = pd.Index(self.values)

        # Optionally reduce the descriptors to a decorrelated subset
        if not self.decorrelate:
            return descriptors
        if isinstance(self.decorrelate, bool):
            # ``True`` selects the default correlation threshold
            return df_uncorrelated_features(descriptors)
        return df_uncorrelated_features(descriptors, threshold=self.decorrelate)