# Source code for baybe.surrogates.gaussian_process.presets.edbo
"""EDBO preset for Gaussian process surrogates."""
from __future__ import annotations
import gc
from collections.abc import Collection
from typing import TYPE_CHECKING
from attrs import define
from typing_extensions import override
from baybe.kernels.basic import MaternKernel
from baybe.kernels.composite import ScaleKernel
from baybe.parameters import TaskParameter
from baybe.parameters.enum import SubstanceEncoding
from baybe.parameters.substance import SubstanceParameter
from baybe.priors.basic import GammaPrior
from baybe.searchspace.discrete import SubspaceDiscrete
from baybe.surrogates.gaussian_process.kernel_factory import KernelFactory
if TYPE_CHECKING:
from torch import Tensor
from baybe.kernels.base import Kernel
from baybe.searchspace.core import SearchSpace
def _contains_encoding(
    subspace: SubspaceDiscrete, encodings: Collection[SubstanceEncoding]
) -> bool:
    """Tell if any of the substance parameters uses one of the specified encodings."""
    substance_parameters = (
        param
        for param in subspace.parameters
        if isinstance(param, SubstanceParameter)
    )
    return any(param.encoding in encodings for param in substance_parameters)
# Substance-parameter encodings for which the EDBO logic switches to its
# specialized prior settings (in combination with a dimensionality threshold).
_EDBO_ENCODINGS = (
    SubstanceEncoding.MORDRED,
    SubstanceEncoding.RDKIT,
    SubstanceEncoding.RDKIT2DDESCRIPTORS,
)
"""Encodings relevant to EDBO logic."""
@define
class EDBOKernelFactory(KernelFactory):
    """A factory providing the kernel for Gaussian process surrogates adapted from EDBO.

    References:
        * https://github.com/b-shields/edbo/blob/master/edbo/bro.py#L664
        * https://doi.org/10.1038/s41586-021-03213-y
    """

    @override
    def __call__(
        self, searchspace: SearchSpace, train_x: Tensor, train_y: Tensor
    ) -> Kernel:
        # Task parameters do not count towards the dimensionality that drives
        # the prior selection below.
        n_task_dims = sum(
            isinstance(p, TaskParameter) for p in searchspace.parameters
        )
        effective_dims = train_x.shape[-1] - n_task_dims

        # EDBO's specialized priors kick in only for the relevant encodings
        # combined with a sufficiently high-dimensional input space.
        uses_edbo_priors = (
            _contains_encoding(searchspace.discrete, _EDBO_ENCODINGS)
            and effective_dims >= 50
        )

        # Select (lengthscale prior, lengthscale init, outputscale prior,
        # outputscale init) for the matching regime.
        if effective_dims < 5:
            # low D priors
            selected = (GammaPrior(1.2, 1.1), 0.2, GammaPrior(5.0, 0.5), 8.0)
        elif uses_edbo_priors and effective_dims < 100:
            # DFT optimized priors
            selected = (GammaPrior(2.0, 0.2), 5.0, GammaPrior(5.0, 0.5), 8.0)
        elif uses_edbo_priors:
            # Mordred optimized priors
            selected = (GammaPrior(2.0, 0.1), 10.0, GammaPrior(2.0, 0.1), 10.0)
        else:
            # OHE optimized priors
            selected = (GammaPrior(3.0, 1.0), 2.0, GammaPrior(5.0, 0.2), 20.0)
        ls_prior, ls_initial, out_prior, out_initial = selected

        return ScaleKernel(
            MaternKernel(
                nu=2.5,
                lengthscale_prior=ls_prior,
                lengthscale_initial_value=ls_initial,
            ),
            outputscale_prior=out_prior,
            outputscale_initial_value=out_initial,
        )
def _edbo_noise_factory(
    searchspace: SearchSpace, train_x: Tensor, train_y: Tensor
) -> tuple[GammaPrior, float]:
    """Create the default noise settings for the Gaussian process surrogate.

    The logic is adapted from EDBO (Experimental Design via Bayesian Optimization).

    Args:
        searchspace: The search space the surrogate is trained on.
        train_x: The training inputs; the last dimension holds the features.
        train_y: The training targets (unused, kept for interface symmetry).

    Returns:
        The noise prior and the corresponding initial noise value.

    References:
        * https://github.com/b-shields/edbo/blob/master/edbo/bro.py#L664
        * https://doi.org/10.1038/s41586-021-03213-y
    """
    # TODO: Replace this function with a proper likelihood factory
    # Task parameters do not count towards the effective dimensionality.
    effective_dims = train_x.shape[-1] - sum(
        isinstance(p, TaskParameter) for p in searchspace.parameters
    )
    # low D priors
    if effective_dims < 5:
        return GammaPrior(1.05, 0.5), 0.1
    # The DFT-, Mordred-, and OHE-optimized noise settings in EDBO all coincide
    # at (GammaPrior(1.5, 0.1), 5.0), so the encoding-based switching condition
    # used for the kernel priors is dead code here and has been dropped.
    return GammaPrior(1.5, 0.1), 5.0
# `attrs.define` creates a new slotted class and discards the originally
# decorated one; an explicit collection reclaims those leftovers promptly.
gc.collect()