import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.utils import check_random_state
from deephyper.skopt.joblib import Parallel, delayed
def _parallel_fit(regressor, X, y):
return regressor.fit(X, y)
# [docs]  (Sphinx source-link marker left over from HTML extraction; not code)
class GradientBoostingQuantileRegressor(BaseEstimator, RegressorMixin):
    """Predict several quantiles with one estimator.

    This is a wrapper around the quantile regression of
    ``GradientBoostingRegressor`` / ``HistGradientBoostingRegressor`` that
    allows you to predict several quantiles in one go. One independent
    regressor is fitted per requested quantile.

    Args:
        quantiles (array-like): Quantiles to predict. By default, the 16%, 50%,
            and 84% quantiles are predicted. ``predict`` needs ``0.5`` to be
            present unless ``return_quantiles=True`` is used, and additionally
            ``0.16`` and ``0.84`` when ``return_std=True`` is used.
        base_estimator (GradientBoostingRegressor or
            HistGradientBoostingRegressor or None): Quantile regressor
            used to make predictions. It must use ``loss="quantile"``. Use
            this to change the hyper-parameters of the estimator. When None,
            ``GradientBoostingRegressor(loss="quantile")`` is used.
            Defaults to None.
        n_jobs (int): Number of jobs to run in parallel for ``fit``.
            If -1, then the number of jobs is set to the number of cores.
            Defaults to 1.
        random_state (int | RandomState | None): Controls the randomness of the
            estimator. Set this to something other than ``None`` for reproducible
            results. Defaults to None.

    Attributes:
        regressors_ (list): Fitted regressors, one per entry of ``quantiles``
            and in the same order. Only available after ``fit``.
    """

    def __init__(
        self,
        quantiles=[0.16, 0.5, 0.84],
        base_estimator=None,
        n_jobs=1,
        random_state=None,
    ):
        # Constructor arguments are stored verbatim (no copy, no validation):
        # ``sklearn.base.clone`` requires the stored object to be the very
        # object that was passed in. The default list is never mutated by
        # this class.
        self.quantiles = quantiles
        self.random_state = random_state
        self.base_estimator = base_estimator
        self.n_jobs = n_jobs

    def set_params(self, **params):
        """Forward hyper-parameters to the wrapped ``base_estimator``.

        This overrides ``BaseEstimator.set_params``: the given parameters are
        applied to ``base_estimator``, not to the wrapper itself.

        If ``base_estimator`` is None, the default estimator that ``fit``
        would use (``GradientBoostingRegressor(loss="quantile")``) is created
        first so that the parameters have somewhere to go.

        Args:
            **params: Parameters passed to ``base_estimator.set_params``.

        Returns:
            GradientBoostingQuantileRegressor: ``self``.
        """
        if not params:
            return self
        if self.base_estimator is None:
            self.base_estimator = GradientBoostingRegressor(loss="quantile")
        self.base_estimator.set_params(**params)
        return self

    def fit(self, X, y):
        """Fit one regressor for each quantile.

        Args:
            X : array-like, shape=(n_samples, n_features)
                Training vectors, where `n_samples` is the number of samples
                and `n_features` is the number of features.
            y : array-like, shape=(n_samples,)
                Target values (real numbers in regression)

        Returns:
            GradientBoostingQuantileRegressor: ``self``, with the fitted
            regressors stored in ``regressors_``.

        Raises:
            ValueError: If ``base_estimator`` is neither a
                ``GradientBoostingRegressor`` nor a
                ``HistGradientBoostingRegressor``, or if it does not use the
                quantile loss.
        """
        rng = check_random_state(self.random_state)

        if self.base_estimator is None:
            base_estimator = GradientBoostingRegressor(loss="quantile")
        else:
            base_estimator = self.base_estimator
            if not isinstance(
                base_estimator,
                (GradientBoostingRegressor, HistGradientBoostingRegressor),
            ):
                raise ValueError(
                    "base_estimator has to be of type"
                    " GradientBoostingRegressor or HistGradientBoostingRegressor."
                )
            if base_estimator.loss != "quantile":
                raise ValueError(
                    "base_estimator has to use quantile"
                    " loss not %s" % base_estimator.loss
                )

        # Work on a private copy so that fitting never modifies the estimator
        # object supplied by the caller.
        template = clone(base_estimator)

        # The predictions for different quantiles should be sorted.
        # Therefore each of the regressors need the same seed. ``clone``
        # copies the random state, so every regressor starts from an
        # identical, independent generator.
        template.set_params(random_state=rng)

        regressors = []
        for q in self.quantiles:
            regressor = clone(template)
            # The two supported estimators name the target quantile differently.
            if isinstance(regressor, GradientBoostingRegressor):
                regressor.set_params(alpha=q)
            elif isinstance(regressor, HistGradientBoostingRegressor):
                regressor.set_params(quantile=q)
            regressors.append(regressor)

        self.regressors_ = Parallel(n_jobs=self.n_jobs, prefer="threads")(
            delayed(_parallel_fit)(regressor, X, y) for regressor in regressors
        )
        return self

    def predict(self, X, return_std=False, return_quantiles=False):
        """Predict.

        By default the prediction of the 0.5 quantile (the median) is
        returned. If `return_quantiles` is set to True, the predictions of
        every quantile are returned instead; this takes precedence over
        `return_std`. If `return_std` is set to True, then return the median
        and the predicted standard deviation, which is approximated as
        the (0.84th quantile - 0.16th quantile) divided by 2.0 and floored
        at 0.01.

        Args:
            X : array-like, shape=(n_samples, n_features)
                where `n_samples` is the number of samples
                and `n_features` is the number of features.
            return_std (bool): Return ``(median, std)`` instead of only the
                median. Defaults to False.
            return_quantiles (bool): Return the predictions of all quantiles,
                one column per entry of ``quantiles``. Defaults to False.

        Returns:
            The transposed stack of per-quantile predictions if
            `return_quantiles`, the tuple ``(median, std)`` if `return_std`,
            otherwise the median prediction.

        Raises:
            ValueError: If a quantile needed for the requested output
                (0.5, and also 0.16 and 0.84 for `return_std`) was not given
                at instantiation.
        """
        # Row i holds the predictions of the regressor fitted for quantiles[i].
        predicted_quantiles = np.asarray([rgr.predict(X) for rgr in self.regressors_])
        if return_quantiles:
            return predicted_quantiles.T

        # ``quantiles`` is documented as array-like; normalise it so that
        # membership tests and ``index`` also work for tuples and arrays.
        quantiles = list(self.quantiles)

        if return_std:
            if any(q not in quantiles for q in (0.16, 0.5, 0.84)):
                raise ValueError(
                    "return_std works only if the quantiles during "
                    "instantiation include 0.16, 0.5 and 0.84"
                )
            # Reuse the predictions computed above rather than predicting again.
            low = predicted_quantiles[quantiles.index(0.16)]
            high = predicted_quantiles[quantiles.index(0.84)]
            mean = predicted_quantiles[quantiles.index(0.5)]
            std = (high - low) / 2.0
            # This avoids NaN when computing the Negative Log-likelihood
            std[std <= 0.01] = 0.01
            return mean, std

        if 0.5 not in quantiles:
            raise ValueError(
                "predict returns the 0.5 quantile by default, which requires "
                "0.5 to be among the quantiles given at instantiation; "
                "use return_quantiles=True to get every fitted quantile"
            )
        # return the mean
        return predicted_quantiles[quantiles.index(0.5)]