Source code for baybe.utils.chemistry
"""Chemistry tools."""
import ssl
import urllib.request
import warnings
from collections.abc import Sequence
from functools import lru_cache
import numpy as np
import pandas as pd
from baybe._optional.chem import (
BaseFingerprintTransformer,
Chem,
ConformerGenerator,
MolFromSmilesTransformer,
fingerprints,
)
from baybe.parameters.enum import SubstanceEncoding
from baybe.settings import active_settings
from baybe.utils.basic import cache_to_disk
[docs]
def name_to_smiles(name: str) -> str:
"""Convert from chemical name to SMILES string using chemical identifier resolver.
This script is useful to combine with ``df.apply`` from pandas, hence it does not
throw exceptions for invalid molecules but instead returns an empty string for
easy subsequent postprocessing of the dataframe.
Args:
name: Name or nickname of compound.
Returns:
SMILES string corresponding to chemical name.
"""
name = name.replace(" ", "%20")
try:
url = "http://cactus.nci.nih.gov/chemical/structure/" + name + "/smiles"
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
with urllib.request.urlopen(url, context=ctx) as web:
smiles = web.read().decode("utf8")
smiles = str(smiles)
if "</div>" in smiles:
return ""
return smiles
except Exception:
return ""
@lru_cache(maxsize=None)
@cache_to_disk
def _molecule_to_fingerprint_features(
molecule: str | Chem.Mol,
encoder: BaseFingerprintTransformer,
) -> np.ndarray:
"""Compute molecular fingerprint for a single molecule.
Args:
molecule: SMILES string or molecule object.
encoder: Instance of the fingerprint class to be used for computation.
Returns:
Array of fingerprint features.
"""
return encoder.transform([molecule])
[docs]
def smiles_to_fingerprint_features(
smiles: Sequence[str],
encoding: SubstanceEncoding,
prefix: str | None = None,
kwargs_conformer: dict | None = None,
kwargs_fingerprint: dict | None = None,
) -> pd.DataFrame:
"""Compute molecular fingerprints for a list of SMILES strings.
Args:
smiles: Sequence of SMILES strings.
encoding: Encoding used to transform SMILES to fingerprints.
prefix: Name prefix for each descriptor (e.g., nBase --> <prefix>_nBase).
kwargs_conformer: kwargs for conformer generator
kwargs_fingerprint: kwargs for fingerprint generator
Returns:
Dataframe containing fingerprints for each SMILES string.
"""
kwargs_fingerprint = kwargs_fingerprint or {}
kwargs_conformer = kwargs_conformer or {}
if encoding is SubstanceEncoding.MORGAN_FP:
warnings.warn(
f"Substance encoding '{encoding.name}' is deprecated and will be disabled "
f"in a future version. Use '{SubstanceEncoding.ECFP.name}' "
f"with 'fp_size' 1024 and 'radius' 4 instead.",
DeprecationWarning,
)
encoding = SubstanceEncoding.ECFP
kwargs_fingerprint.update({"fp_size": 1024, "radius": 4})
elif encoding is SubstanceEncoding.RDKIT:
warnings.warn(
f"Substance encoding '{encoding.name}' is deprecated and will be disabled "
f"in a future version. Use '{SubstanceEncoding.RDKIT2DDESCRIPTORS.name}' "
f"instead.",
DeprecationWarning,
)
encoding = SubstanceEncoding.RDKIT2DDESCRIPTORS
fingerprint_cls = get_fingerprint_class(encoding)
fingerprint_encoder = fingerprint_cls(**kwargs_fingerprint)
if fingerprint_encoder.requires_conformers:
mol_list = ConformerGenerator(**kwargs_conformer).transform(
MolFromSmilesTransformer().transform(smiles)
)
else:
mol_list = smiles
features = np.concatenate(
[
_molecule_to_fingerprint_features(mol, fingerprint_encoder)
for mol in mol_list
]
)
name = f"{encoding.name}_"
prefix = prefix + "_" if prefix else ""
feature_names_out = fingerprint_encoder.get_feature_names_out()
no_descriptor_names = all("fingerprint" in f for f in feature_names_out)
suffixes = [
f.split("fingerprint")[1] if no_descriptor_names else f.replace(" ", "_")
for f in feature_names_out
]
col_names = [prefix + name + suffix for suffix in suffixes]
df = pd.DataFrame(
features, columns=col_names, dtype=active_settings.DTypeFloatNumpy
)
return df
[docs]
def get_fingerprint_class(encoding: SubstanceEncoding) -> BaseFingerprintTransformer:
"""Retrieve the fingerprint class corresponding to a given encoding.
Args:
encoding: A substance encoding.
Raises:
ValueError: If no fingerprint class for the specified encoding is found.
Returns:
The fingerprint class.
"""
# Exception case
if encoding is SubstanceEncoding.RDKITFINGERPRINT:
return fingerprints.RDKitFingerprint
try:
cls_name = next(
name
for name in dir(fingerprints)
if (encoding.name + "Fingerprint").casefold() == name.casefold()
)
except StopIteration as e:
raise ValueError(
f"No fingerprint class exists for the specified encoding '{encoding.name}'."
) from e
return getattr(fingerprints, cls_name)
[docs]
def get_canonical_smiles(smiles: str) -> str:
"""Return the "canonical" representation of the given SMILES."""
try:
return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
except Exception:
raise ValueError(f"The SMILES '{smiles}' does not appear to be valid.")