Source code for deephyper.sklearn.regressor._autosklearn1

"""
This module provides ``problem_autosklearn1`` and ``run_autosklearn1`` for regression tasks.
"""
import warnings
from inspect import signature

import ConfigSpace as cs
from deephyper.problem import HpProblem
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor


def minmaxstdscaler() -> Pipeline:
    """MinMax preprocesssing followed by Standard normalization.

    Returns:
        Pipeline: a pipeline with two steps ``[MinMaxScaler, StandardScaler]``.
    """
    preprocessor = Pipeline(
        [
            ("minmaxscaler", MinMaxScaler()),
            ("stdscaler", StandardScaler()),
        ]
    )
    return preprocessor

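# Illustrative usage sketch (an assumption for demonstration, not part of
# the original module): apply the preprocessor above to a small toy array.
def _demo_minmaxstdscaler():
    import numpy as np

    X = np.array([[1.0, 200.0], [2.0, 400.0], [3.0, 600.0]])
    # each column is first rescaled to [0, 1] by MinMaxScaler, then
    # centered and scaled to unit variance by StandardScaler
    return minmaxstdscaler().fit_transform(X)
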

REGRESSORS = {
    "RandomForest": RandomForestRegressor,
    "Linear": LinearRegression,
    "AdaBoost": AdaBoostRegressor,
    "KNeighbors": KNeighborsRegressor,
    "MLP": MLPRegressor,
    "SVR": SVR,
    "XGBoost": XGBRegressor,
}

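# Illustrative sketch (an assumption for demonstration, not part of the
# original module): look up a class in ``REGRESSORS`` and keep only the
# keyword arguments its constructor accepts, the same signature-based
# filtering applied in ``run_autosklearn1`` below. The config values here
# are hypothetical.
def _demo_signature_filtering():
    config = {"regressor": "KNeighbors", "n_neighbors": 5, "C": 1.0}
    clf_class = REGRESSORS[config["regressor"]]
    allowed = signature(clf_class).parameters.keys()
    clf_params = {k: v for k, v in config.items() if k in allowed}
    # "regressor" and "C" are filtered out; only n_neighbors=5 is passed
    return clf_class(**clf_params)
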

problem_autosklearn1 = HpProblem()

regressor = problem_autosklearn1.add_hyperparameter(
    name="regressor",
    value=["RandomForest", "Linear", "AdaBoost", "KNeighbors", "MLP", "SVR", "XGBoost"],
)

# n_estimators
n_estimators = problem_autosklearn1.add_hyperparameter(
    name="n_estimators", value=(1, 2000, "log-uniform")
)

cond_n_estimators = cs.OrConjunction(
    cs.EqualsCondition(n_estimators, regressor, "RandomForest"),
    cs.EqualsCondition(n_estimators, regressor, "AdaBoost"),
)

problem_autosklearn1.add_condition(cond_n_estimators)

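# Illustrative sketch (an assumption for demonstration, not part of the
# original module): the same ``OrConjunction`` pattern on a tiny standalone
# ConfigSpace, showing that a conditional hyperparameter is active only
# when its parent takes one of the listed values. The names "model" and
# "n_est" are hypothetical, and the ConfigSpace calls assume the pre-1.0
# API used elsewhere in this module.
def _demo_or_conjunction():
    space = cs.ConfigurationSpace(seed=42)
    model = cs.CategoricalHyperparameter("model", ["RF", "Ada", "KNN"])
    n_est = cs.UniformIntegerHyperparameter("n_est", 1, 2000, log=True)
    space.add_hyperparameters([model, n_est])
    space.add_condition(
        cs.OrConjunction(
            cs.EqualsCondition(n_est, model, "RF"),
            cs.EqualsCondition(n_est, model, "Ada"),
        )
    )
    # sampled configurations contain "n_est" only when model is "RF" or "Ada"
    return space.sample_configuration()
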
# max_depth
max_depth = problem_autosklearn1.add_hyperparameter(
    name="max_depth", value=(2, 100, "log-uniform")
)

cond_max_depth = cs.EqualsCondition(max_depth, regressor, "RandomForest")

problem_autosklearn1.add_condition(cond_max_depth)

# n_neighbors
n_neighbors = problem_autosklearn1.add_hyperparameter(
    name="n_neighbors", value=(1, 100)
)

cond_n_neighbors = cs.EqualsCondition(n_neighbors, regressor, "KNeighbors")

problem_autosklearn1.add_condition(cond_n_neighbors)

# alpha
alpha = problem_autosklearn1.add_hyperparameter(
    name="alpha", value=(1e-5, 10.0, "log-uniform")
)

cond_alpha = cs.EqualsCondition(alpha, regressor, "MLP")

problem_autosklearn1.add_condition(cond_alpha)

# C
C = problem_autosklearn1.add_hyperparameter(name="C", value=(1e-5, 10.0, "log-uniform"))

cond_C = cs.EqualsCondition(C, regressor, "SVR")

problem_autosklearn1.add_condition(cond_C)

# kernel
kernel = problem_autosklearn1.add_hyperparameter(
    name="kernel", value=["linear", "poly", "rbf", "sigmoid"]
)

cond_kernel = cs.EqualsCondition(kernel, regressor, "SVR")

problem_autosklearn1.add_condition(cond_kernel)

# gamma
gamma = problem_autosklearn1.add_hyperparameter(
    name="gamma", value=(1e-5, 10.0, "log-uniform")
)

cond_gamma = cs.OrConjunction(
    cs.EqualsCondition(gamma, kernel, "rbf"),
    cs.EqualsCondition(gamma, kernel, "poly"),
    cs.EqualsCondition(gamma, kernel, "sigmoid"),
)

problem_autosklearn1.add_condition(cond_gamma)


def run_autosklearn1(config: dict, load_data: callable) -> float:
    """Run function which can be used for AutoML regression.

    It has to be used with the ``deephyper.sklearn.regressor.problem_autosklearn1`` problem definition, which corresponds to:

    .. code-block::

        Configuration space object:
            Hyperparameters:
                C, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                alpha, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                gamma, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                kernel, Type: Categorical, Choices: {linear, poly, rbf, sigmoid}, Default: linear
                max_depth, Type: UniformInteger, Range: [2, 100], Default: 14, on log-scale
                n_estimators, Type: UniformInteger, Range: [1, 2000], Default: 45, on log-scale
                n_neighbors, Type: UniformInteger, Range: [1, 100], Default: 50
                regressor, Type: Categorical, Choices: {RandomForest, Linear, AdaBoost, KNeighbors, MLP, SVR, XGBoost}, Default: RandomForest
            Conditions:
                (gamma | kernel == 'rbf' || gamma | kernel == 'poly' || gamma | kernel == 'sigmoid')
                (n_estimators | regressor == 'RandomForest' || n_estimators | regressor == 'AdaBoost')
                C | regressor == 'SVR'
                alpha | regressor == 'MLP'
                kernel | regressor == 'SVR'
                max_depth | regressor == 'RandomForest'
                n_neighbors | regressor == 'KNeighbors'

    Args:
        config (dict): a hyperparameter configuration ``dict`` corresponding to ``deephyper.sklearn.regressor.problem_autosklearn1``.
        load_data (callable): a function returning the data as Numpy arrays ``(X, y)``.

    Returns:
        float: the :math:`R^2` score on the validation set, or ``-1.0`` if the fit fails.
    """
    # default values injected when the configuration does not provide them
    config["random_state"] = config.get("random_state", 42)
    config["n_jobs"] = config.get("n_jobs", 1)

    X, y = load_data()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=config["random_state"]
    )

    preproc = minmaxstdscaler()
    X_train = preproc.fit_transform(X_train)
    X_test = preproc.transform(X_test)

    clf_class = REGRESSORS[config["regressor"]]

    # keep only the parameters accepted by the current regressor
    sig = signature(clf_class)
    clf_allowed_params = list(sig.parameters.keys())
    clf_params = {
        k: v
        for k, v in config.items()
        if k in clf_allowed_params and v not in ["nan", "NA"]
    }

    try:  # good practice to manage the fail value yourself
        clf = clf_class(**clf_params)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf.fit(X_train, y_train)

        fit_is_complete = True
    except Exception:
        fit_is_complete = False

    if fit_is_complete:
        y_pred = clf.predict(X_test)
        r2 = r2_score(y_test, y_pred)
    else:
        r2 = -1.0

    return r2
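
# Illustrative sketch (an assumption for demonstration, not part of the
# original module): call ``run_autosklearn1`` on a small scikit-learn
# dataset with a hand-written configuration; the dataset and values are
# chosen only for this example.
def _demo_run_autosklearn1():
    from sklearn.datasets import load_diabetes

    def load_data():
        return load_diabetes(return_X_y=True)

    config = {"regressor": "RandomForest", "n_estimators": 45, "max_depth": 14}
    return run_autosklearn1(config, load_data)  # R^2 on the held-out split
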
if __name__ == "__main__": print(problem_autosklearn1)