"""Substance parameters."""

from functools import cached_property
from typing import Any, ClassVar

import pandas as pd
from attrs import define, field
from attrs.validators import and_, deep_mapping, instance_of, min_len

from baybe.parameters.base import DiscreteParameter
from baybe.parameters.enum import SubstanceEncoding
from baybe.parameters.validation import validate_decorrelation
from baybe.utils.basic import group_duplicate_values
from baybe.utils.dataframe import df_drop_single_value_columns, df_uncorrelated_features

# Probe for the builtin by evaluating its bare name: where it does not exist,
# the lookup raises ``NameError`` and the backport package is imported instead.
try:  # For python < 3.11, use the exceptiongroup backport
    ExceptionGroup
except NameError:
    from exceptiongroup import ExceptionGroup

# Plain alias of ``str``: it adds no runtime checking and only makes signatures
# and annotations more readable.
Smiles = str
"""Type alias for SMILES strings."""
[docs]@define(frozen=True,slots=False)classSubstanceParameter(DiscreteParameter):"""Generic substances that are treated with cheminformatics descriptors. Only a decorrelated subset of descriptors should be used as otherwise this can result in a large number of features. For a handful of molecules, keeping only descriptors that have a maximum correlation of 0.7 reduces the number of descriptors to about 5-20. The number might be substantially higher with more labels given. """# class variablesis_numerical:ClassVar[bool]=False# See base class.# object variablesdata:dict[str,Smiles]=field(validator=deep_mapping(mapping_validator=min_len(2),# FIXME[typing]: https://github.com/python-attrs/attrs/issues/1206key_validator=and_(instance_of(str),min_len(1)),value_validator=lambda*x:None,))"""A mapping that provides the SMILES strings for all available parameter values."""decorrelate:bool|float=field(default=True,validator=validate_decorrelation)"""Specifies the used decorrelation mode for the parameter encoding. - ``False``: The encoding is used as is. - ``True``: The encoding is decorrelated using a default correlation threshold. - float in (0, 1): The encoding is decorrelated using the specified threshold. """encoding:SubstanceEncoding=field(default=SubstanceEncoding.MORDRED,converter=SubstanceEncoding)# See base class.@data.validatordef_validate_substance_data(# noqa: DOC101, DOC103self,_:Any,data:dict[str,Smiles])->None:"""Validate that the substance data, provided as SMILES, is valid. Raises: ValueError: If one or more of the SMILES are invalid. ValueError: If the several entries represent the same substance. 
"""frombaybe.utilsimportchemistry# Check for invalid SMILEScanonical_smiles={}exceptions=[]forname,smilesindata.items():try:canonical_smiles[name]=chemistry.get_canonical_smiles(smiles)exceptValueError:exceptions.append(ValueError(f"The SMILES '{smiles}' for molecule '{name}' does "f"not appear to be valid."))ifexceptions:raiseExceptionGroup("invalid SMILES",exceptions)# Check for duplicate substancesifgroups:=group_duplicate_values(canonical_smiles):exceptions=[]forgroup,substancesingroups.items():group_data={s:data[s]forsinsubstances}exceptions.append(ValueError(f"The following entries all represent the same substance "f"'{group}': {group_data}."))raiseExceptionGroup("duplicate substances",exceptions)@propertydefvalues(self)->tuple:"""Returns the labels of the given set of molecules."""returntuple(self.data.keys())@cached_propertydefcomp_df(self)->pd.DataFrame:# noqa: D102# See base class.frombaybe.utilsimportchemistryvals=list(self.data.values())pref=self.name+"_"# Get the raw descriptorsifself.encodingisSubstanceEncoding.MORDRED:comp_df=chemistry.smiles_to_mordred_features(vals,prefix=pref)elifself.encodingisSubstanceEncoding.RDKIT:comp_df=chemistry.smiles_to_rdkit_features(vals,prefix=pref)elifself.encodingisSubstanceEncoding.MORGAN_FP:comp_df=chemistry.smiles_to_fp_features(vals,prefix=pref)else:raiseValueError(f"Unknown parameter encoding {self.encoding} for parameter {self.name}.")# Drop NaN and constant columnscomp_df=comp_df.loc[:,~comp_df.isna().any(axis=0)]comp_df=df_drop_single_value_columns(comp_df)# If there are bool columns, convert them to int (possible for Mordred)bool_cols=comp_df.select_dtypes(bool).columnscomp_df[bool_cols]=comp_df[bool_cols].astype(int)# Label the rows with the molecule namescomp_df.index=pd.Index(self.values)# Get a decorrelated subset of the 
descriptorsifself.decorrelate:ifisinstance(self.decorrelate,bool):comp_df=df_uncorrelated_features(comp_df)else:comp_df=df_uncorrelated_features(comp_df,threshold=self.decorrelate)returncomp_df