Source code for deephyper.skopt.learning.gaussian_process.gpr

import warnings

import numpy as np
import sklearn
from packaging import version
from scipy.linalg import cho_solve, solve_triangular
from sklearn.gaussian_process import (
    GaussianProcessRegressor as sk_GaussianProcessRegressor,
)
from sklearn.utils import check_array

from .kernels import RBF, ConstantKernel, Sum, WhiteKernel


def _param_for_white_kernel_in_Sum(kernel, kernel_str=""):
    """Check if a WhiteKernel exists in a Sum Kernel
    and if it does return the corresponding key in
    `kernel.get_params()`
    """
    if kernel_str != "":
        kernel_str = kernel_str + "__"

    if isinstance(kernel, Sum):
        for param, child in kernel.get_params(deep=False).items():
            if isinstance(child, WhiteKernel):
                return True, kernel_str + param
            else:
                present, child_str = _param_for_white_kernel_in_Sum(
                    child, kernel_str + param
                )
                if present:
                    return True, child_str

    return False, "_"



[docs]
class GaussianProcessRegressor(sk_GaussianProcessRegressor):
    """GaussianProcessRegressor that allows noise tunability.

    This implementation is based on Algorithm 2.1 of *Gaussian Processes for
    Machine Learning* (GPML) by Rasmussen and Williams.

    In addition to the standard scikit-learn estimator API,
    `GaussianProcessRegressor`:

        * Allows prediction without prior fitting (based on the GP prior).
        * Provides an additional method `sample_y(X)` to evaluate samples drawn
        from the GPR (prior or posterior) at given inputs.
        * Exposes a method `log_marginal_likelihood(theta)` that can be used
        externally for alternative hyperparameter selection strategies,
        such as Markov chain Monte Carlo.

    Args:
        kernel (Kernel):  
            The kernel specifying the covariance function of the GP. If None,
            the default kernel ``1.0 * RBF(1.0)`` is used. Kernel hyperparameters
            are optimized during fitting.

        alpha (float or array-like, optional):  
            Value added to the diagonal of the kernel matrix during fitting
            (default: 1e-10). Larger values correspond to higher assumed noise
            levels and improve numerical stability. If an array is provided, it
            must match the number of training samples and is interpreted as a
            per-sample noise level. Equivalent to adding a `WhiteKernel` with
            `c = alpha`. Provided mainly for convenience and consistency with
            Ridge regression.

        optimizer (str or callable, optional):  
            Optimizer for kernel parameter tuning (default: `"fmin_l_bfgs_b"`).  
            A string selects one of the built-in optimizers.  
            A callable must follow the signature:

            .. code-block:: python
            
                def optimizer(obj_func, initial_theta, bounds):
                    # obj_func: objective to maximize; accepts theta and
                    #           optionally eval_gradient
                    # initial_theta: initial hyperparameter state
                    # bounds: box constraints for theta
                    return theta_opt, func_min

            If None, kernel parameters are kept fixed.

            Available built-in optimizers:
                * `"fmin_l_bfgs_b"`

        n_restarts_optimizer (int, optional):  
            Number of optimizer restarts to maximize the log-marginal
            likelihood (default: 0). The first run uses the kernel's initial
            parameters; additional runs start from random log-uniform samples.
            If > 0, all parameter bounds must be finite. A value of 0 implies a
            single run.

        normalize_y (bool, optional):  
            Whether to normalize target values to have zero mean (default:
            False). Should be enabled when the target mean deviates significantly
            from zero. Note: this effectively alters the GP prior using the data,
            which contradicts the likelihood principle; thus the default is False.

        copy_X_train (bool, optional):  
            If True (default), a persistent copy of the training data is stored.
            If False, only a reference is kept, so external modification of the
            data may affect predictions.

        random_state (int or numpy.random.RandomState, optional):  
            Random generator used for initialization. If an integer is provided,
            it sets the seed. Defaults to NumPy's global RNG.

        noise (str, optional):  
            If set to `"gaussian"`, the model assumes that `y` is a noisy estimate
            of the latent function `f(x)` with Gaussian noise.

    Attributes:
        X_train_ (array-like of shape (n_samples, n_features)):  
            Training feature values.

        y_train_ (array-like of shape (n_samples, [n_output_dims])):  
            Training target values.

        kernel_ (Kernel):  
            The kernel used for prediction, identical in structure to the input
            kernel but with optimized hyperparameters.

        L_ (array-like of shape (n_samples, n_samples)):  
            Lower-triangular Cholesky decomposition of the kernel matrix evaluated
            at ``X_train_``.

        alpha_ (array-like of shape (n_samples,)):  
            Dual coefficients of training data points in kernel space.

        log_marginal_likelihood_value_ (float):  
            Log-marginal likelihood evaluated at ``self.kernel_.theta``.

        noise_ (float):  
            Estimated Gaussian noise level. Only relevant when ``noise="gaussian"``.
    """

    def __init__(
        self,
        kernel=None,
        alpha=1e-10,
        optimizer="fmin_l_bfgs_b",
        n_restarts_optimizer=0,
        normalize_y=False,
        copy_X_train=True,
        random_state=None,
        noise=None,
    ):
        self.noise = noise
        super(GaussianProcessRegressor, self).__init__(
            kernel=kernel,
            alpha=alpha,
            optimizer=optimizer,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y,
            copy_X_train=copy_X_train,
            random_state=random_state,
        )


[docs]
    def fit(self, X, y):
        """Fit Gaussian process regression model.

        Args:
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples, [n_output_dims])
            Target values

        Returns:
        self
            Returns an instance of self.
        """
        if isinstance(self.noise, str) and self.noise != "gaussian":
            raise ValueError("expected noise to be 'gaussian', got %s" % self.noise)

        if self.kernel is None:
            self.kernel = ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(
                1.0, length_scale_bounds="fixed"
            )
        if self.noise == "gaussian":
            self.kernel = self.kernel + WhiteKernel()
        elif self.noise:
            self.kernel = self.kernel + WhiteKernel(
                noise_level=self.noise, noise_level_bounds="fixed"
            )
        super(GaussianProcessRegressor, self).fit(X, y)

        self.noise_ = None

        if self.noise:
            # The noise component of this kernel should be set to zero
            # while estimating K(X_test, X_test)
            # Note that the term K(X, X) should include the noise but
            # this (K(X, X))^-1y is precomputed as the attribute `alpha_`.
            # (Notice the underscore).
            # This has been described in Eq 2.24 of
            # http://www.gaussianprocess.org/gpml/chapters/RW2.pdf
            # Hence this hack
            if isinstance(self.kernel_, WhiteKernel):
                self.kernel_.set_params(noise_level=0.0)

            else:
                white_present, white_param = _param_for_white_kernel_in_Sum(
                    self.kernel_
                )

                # This should always be true. Just in case.
                if white_present:
                    noise_kernel = self.kernel_.get_params()[white_param]
                    self.noise_ = noise_kernel.noise_level
                    self.kernel_.set_params(
                        **{white_param: WhiteKernel(noise_level=0.0)}
                    )

        # Precompute arrays needed at prediction
        L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0]))
        self.K_inv_ = L_inv.dot(L_inv.T)

        # Fix deprecation warning #462
        sklearn_version = version.parse(sklearn.__version__)
        if sklearn_version >= version.parse("0.23.0"):
            self.y_train_std_ = self._y_train_std
            self.y_train_mean_ = self._y_train_mean
        elif sklearn_version >= version.parse("0.19.0"):
            self.y_train_mean_ = self._y_train_mean
            self.y_train_std_ = 1
        else:
            self.y_train_mean_ = self.y_train_mean
            self.y_train_std_ = 1

        return self



[docs]
    def predict(
        self,
        X,
        return_std=False,
        return_cov=False,
        return_mean_grad=False,
        return_std_grad=False,
    ):
        """Predict output for X.

        In addition to the mean of the predictive distribution, also its
        standard deviation (return_std=True) or covariance (return_cov=True),
        the gradient of the mean and the standard-deviation with respect to X
        can be optionally provided.

        Args:
        X : array-like, shape = (n_samples, n_features)
            Query points where the GP is evaluated.

        return_std : bool, default: False
            If True, the standard-deviation of the predictive distribution at
            the query points is returned along with the mean.

        return_cov : bool, default: False
            If True, the covariance of the joint predictive distribution at
            the query points is returned along with the mean.

        return_mean_grad : bool, default: False
            Whether or not to return the gradient of the mean.
            Only valid when X is a single point.

        return_std_grad : bool, default: False
            Whether or not to return the gradient of the std.
            Only valid when X is a single point.

        Returns:
        y_mean : array, shape = (n_samples, [n_output_dims])
            Mean of predictive distribution a query points

        y_std : array, shape = (n_samples,), optional
            Standard deviation of predictive distribution at query points.
            Only returned when return_std is True.

        y_cov : array, shape = (n_samples, n_samples), optional
            Covariance of joint predictive distribution a query points.
            Only returned when return_cov is True.

        y_mean_grad : shape = (n_samples, n_features)
            The gradient of the predicted mean

        y_std_grad : shape = (n_samples, n_features)
            The gradient of the predicted std.
        """
        if return_std and return_cov:
            raise RuntimeError(
                "Not returning standard deviation of predictions when "
                "returning full covariance."
            )

        if return_std_grad and not return_std:
            raise ValueError("Not returning std_gradient without returning " "the std.")

        X = check_array(X)
        if X.shape[0] != 1 and (return_mean_grad or return_std_grad):
            raise ValueError("Not implemented for n_samples > 1")

        if not hasattr(self, "X_train_"):  # Not fit; predict based on GP prior
            y_mean = np.zeros(X.shape[0])
            if return_cov:
                y_cov = self.kernel(X)
                return y_mean, y_cov
            elif return_std:
                y_var = self.kernel.diag(X)
                return y_mean, np.sqrt(y_var)
            else:
                return y_mean

        else:  # Predict based on GP posterior
            K_trans = self.kernel_(X, self.X_train_)
            y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
            # undo normalisation
            y_mean = self.y_train_std_ * y_mean + self.y_train_mean_

            if return_cov:
                v = cho_solve((self.L_, True), K_trans.T)  # Line 5
                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
                # undo normalisation
                y_cov = y_cov * self.y_train_std_**2
                return y_mean, y_cov

            elif return_std:
                K_inv = self.K_inv_

                # Compute variance of predictive distribution
                y_var = self.kernel_.diag(X)
                y_var -= np.einsum("ki,kj,ij->k", K_trans, K_trans, K_inv)

                # Check if any of the variances is negative because of
                # numerical issues. If yes: set the variance to 0.
                y_var_negative = y_var < 0
                if np.any(y_var_negative):
                    warnings.warn(
                        "Predicted variances smaller than 0. "
                        "Setting those variances to 0."
                    )
                    y_var[y_var_negative] = 0.0
                # undo normalisation
                y_var = y_var * self.y_train_std_**2
                y_std = np.sqrt(y_var)

            if return_mean_grad:
                grad = self.kernel_.gradient_x(X[0], self.X_train_)
                grad_mean = np.dot(grad.T, self.alpha_)
                # undo normalisation
                grad_mean = grad_mean * self.y_train_std_
                if return_std_grad:
                    grad_std = np.zeros(X.shape[1])
                    if not np.allclose(y_std, grad_std):
                        grad_std = -np.dot(K_trans, np.dot(K_inv, grad))[0] / y_std
                        # undo normalisation
                        grad_std = grad_std * self.y_train_std_**2
                    return y_mean, y_std, grad_mean, grad_std

                if return_std:
                    return y_mean, y_std, grad_mean
                else:
                    return y_mean, grad_mean

            else:
                if return_std:
                    return y_mean, y_std
                else:
                    return y_mean