"""
This module provides ``problem_autosklearn1`` and ``run_autosklearn`` for classification tasks.
"""
import warnings
from inspect import signature
import ConfigSpace as cs
from deephyper.problem import HpProblem
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
def minmaxstdscaler() -> Pipeline:
"""MinMax preprocesssing followed by Standard normalization.
Returns:
Pipeline: a pipeline with two steps ``[MinMaxScaler, StandardScaler]``.
"""
preprocessor = Pipeline(
[
("minmaxscaler", MinMaxScaler()),
("stdscaler", StandardScaler()),
]
)
return preprocessor
problem_autosklearn1 = HpProblem()
classifier = problem_autosklearn1.add_hyperparameter(
name="classifier",
value=[
"RandomForest",
"Logistic",
"AdaBoost",
"KNeighbors",
"MLP",
"SVC",
"XGBoost",
],
)
# n_estimators
n_estimators = problem_autosklearn1.add_hyperparameter(
name="n_estimators", value=(1, 2000, "log-uniform")
)
cond_n_estimators = cs.OrConjunction(
cs.EqualsCondition(n_estimators, classifier, "RandomForest"),
cs.EqualsCondition(n_estimators, classifier, "AdaBoost"),
)
problem_autosklearn1.add_condition(cond_n_estimators)
# max_depth
max_depth = problem_autosklearn1.add_hyperparameter(
name="max_depth", value=(2, 100, "log-uniform")
)
cond_max_depth = cs.EqualsCondition(max_depth, classifier, "RandomForest")
problem_autosklearn1.add_condition(cond_max_depth)
# n_neighbors
n_neighbors = problem_autosklearn1.add_hyperparameter(
name="n_neighbors", value=(1, 100)
)
cond_n_neighbors = cs.EqualsCondition(n_neighbors, classifier, "KNeighbors")
problem_autosklearn1.add_condition(cond_n_neighbors)
# alpha
alpha = problem_autosklearn1.add_hyperparameter(
name="alpha", value=(1e-5, 10.0, "log-uniform")
)
cond_alpha = cs.EqualsCondition(alpha, classifier, "MLP")
problem_autosklearn1.add_condition(cond_alpha)
# C
C = problem_autosklearn1.add_hyperparameter(name="C", value=(1e-5, 10.0, "log-uniform"))
cond_C = cs.OrConjunction(
cs.EqualsCondition(C, classifier, "Logistic"),
cs.EqualsCondition(C, classifier, "SVC"),
)
problem_autosklearn1.add_condition(cond_C)
# kernel
kernel = problem_autosklearn1.add_hyperparameter(
name="kernel", value=["linear", "poly", "rbf", "sigmoid"]
)
cond_kernel = cs.EqualsCondition(kernel, classifier, "SVC")
problem_autosklearn1.add_condition(cond_kernel)
# gamma
gamma = problem_autosklearn1.add_hyperparameter(
name="gamma", value=(1e-5, 10.0, "log-uniform")
)
cond_gamma = cs.OrConjunction(
cs.EqualsCondition(gamma, kernel, "rbf"),
cs.EqualsCondition(gamma, kernel, "poly"),
cs.EqualsCondition(gamma, kernel, "sigmoid"),
)
problem_autosklearn1.add_condition(cond_gamma)
# Mapping available classifiers
CLASSIFIERS = {
"RandomForest": RandomForestClassifier,
"Logistic": LogisticRegression,
"AdaBoost": AdaBoostClassifier,
"KNeighbors": KNeighborsClassifier,
"MLP": MLPClassifier,
"SVC": SVC,
"XGBoost": XGBClassifier,
}
[docs]def run_autosklearn1(config: dict, load_data: callable) -> float:
"""Run function which can be used for AutoML classification.
It has to be used with the ``deephyper.sklearn.classifier.problem_autosklearn1`` problem definition which corresponds to:
.. code-block::
Configuration space object:
Hyperparameters:
C, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
alpha, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
classifier, Type: Categorical, Choices: {RandomForest, Logistic, AdaBoost, KNeighbors, MLP, SVC, XGBoost}, Default: RandomForest
gamma, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
kernel, Type: Categorical, Choices: {linear, poly, rbf, sigmoid}, Default: linear
max_depth, Type: UniformInteger, Range: [2, 100], Default: 14, on log-scale
n_estimators, Type: UniformInteger, Range: [1, 2000], Default: 45, on log-scale
n_neighbors, Type: UniformInteger, Range: [1, 100], Default: 50
Conditions:
(C | classifier == 'Logistic' || C | classifier == 'SVC')
(gamma | kernel == 'rbf' || gamma | kernel == 'poly' || gamma | kernel == 'sigmoid')
(n_estimators | classifier == 'RandomForest' || n_estimators | classifier == 'AdaBoost')
alpha | classifier == 'MLP'
kernel | classifier == 'SVC'
max_depth | classifier == 'RandomForest'
n_neighbors | classifier == 'KNeighbors'
Args:
config (dict): an hyperparameter configuration ``dict`` corresponding to the ``deephyper.sklearn.classifier.problem_autosklearn1``.
load_data (callable): a function returning data as Numpy arrays ``(X, y)``.
Returns:
float: returns the accuracy on the validation set.
"""
config["random_state"] = config.get("random_state", 42)
config["n_jobs"] = config.get("n_jobs", 1)
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=config["random_state"]
)
preproc = minmaxstdscaler()
X_train = preproc.fit_transform(X_train)
X_test = preproc.transform(X_test)
mapping = CLASSIFIERS
clf_class = mapping[config["classifier"]]
# keep parameters possible for the current classifier
sig = signature(clf_class)
clf_allowed_params = list(sig.parameters.keys())
clf_params = {
k: v
for k, v in config.items()
if k in clf_allowed_params and not (v in ["nan", "NA"])
}
try: # good practice to manage the fail value yourself...
clf = clf_class(**clf_params)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
clf.fit(X_train, y_train)
fit_is_complete = True
except: # noqa: E722
fit_is_complete = False
if fit_is_complete:
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
else:
acc = -1.0
return acc
if __name__ == "__main__":
print(problem_autosklearn1)