# Source code for deephyper.hpo.gmm

"""A module for a sampler based on a Gaussian Mixture Model."""

import warnings

import numpy as np
import pandas as pd
from ConfigSpace.hyperparameters import (
    CategoricalHyperparameter,
    FloatHyperparameter,
    IntegerHyperparameter,
    OrdinalHyperparameter,
)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.utils import check_random_state

# Public names exported by this module on ``from deephyper.hpo.gmm import *``.
__all__ = ["GMMSampler"]


class GMMSampler:
    """Gaussian Mixture Model sampler.

    The columns of a dataframe are encoded (one-hot for categorical
    hyperparameters, integer codes for ordinal ones, standard scaling for
    integer and float ones), a Gaussian mixture is fitted on the encoded
    matrix, and samples drawn from the mixture are decoded back into
    hyperparameter values.

    Args:
        config_space (ConfigSpace): the configuration space used to look up the
            type of each hyperparameter as well as its choices, sequence or
            lower/upper bounds.
        random_state (Union[int,RandomState], optional): a random state for the
            sampler. Defaults to ``None``.
        n_components (int, optional): the maximum number of mixture components.
            The number actually used by ``fit`` is capped by the number of rows
            of the dataframe. Defaults to ``5``.
    """

    def __init__(self, config_space, random_state=None, n_components=5):
        self.config_space = config_space
        self.rng = check_random_state(random_state)
        self.n_components = n_components

        # Column names grouped by hyperparameter type (set by
        # ``check_variable_types``).
        self.categorical_cols = []
        self.ordinal_cols = []
        self.integer_cols = []
        self.float_cols = []
        self.numerical_cols = []

        # Encoders and model (set by ``fit``).
        self.categorical_encoder = None
        self.ordinal_encoder = None
        self.numerical_encoder = None
        self.gmm = None

        # Width of each group of columns in the encoded matrix (set by ``fit``).
        self.n_X_cat = 0
        self.n_X_ord = 0
        self.n_X_num = 0

    def check_variable_types(self, df):
        """Utility that checks the columns of the dataframe against the config space.

        Each column found in the configuration space is registered in
        ``categorical_cols``, ``ordinal_cols``, ``integer_cols`` or
        ``float_cols``. Columns that are not in the space are skipped with a
        warning. ``numerical_cols`` is the integer columns followed by the
        float columns.

        Args:
            df (pd.DataFrame): the dataframe whose columns are checked.

        Raises:
            ValueError: if a hyperparameter is of none of the supported types.
        """
        self.categorical_cols = []
        self.ordinal_cols = []
        self.integer_cols = []
        self.float_cols = []

        for hp_name in df.columns:
            try:
                hp = self.config_space[hp_name]
            except KeyError:
                warnings.warn(
                    f"Skipping hyperparameter: '{hp_name}' as it is not included in the space.",
                    stacklevel=2,
                )
                continue

            if isinstance(hp, CategoricalHyperparameter):
                self.categorical_cols.append(hp_name)
            elif isinstance(hp, OrdinalHyperparameter):
                self.ordinal_cols.append(hp_name)
            elif isinstance(hp, IntegerHyperparameter):
                self.integer_cols.append(hp_name)
            elif isinstance(hp, FloatHyperparameter):
                self.float_cols.append(hp_name)
            else:
                raise ValueError(f"Incompatible hyperparameter {hp}")

        # Integer columns come first: ``sample`` relies on this ordering.
        self.numerical_cols = self.integer_cols + self.float_cols

    def fit(self, df: pd.DataFrame):
        """Fits the Gaussian mixture model.

        Args:
            df (pd.DataFrame): the dataframe used to fit the model.
        """
        n_samples = df.shape[0]
        self.check_variable_types(df)

        # The categories are taken from the space (not from the data) so that
        # values absent from ``df`` can still be encoded and decoded.
        categorical_categories = [
            list(self.config_space[hp_name].choices) for hp_name in self.categorical_cols
        ]
        self.categorical_encoder = OneHotEncoder(
            categories=categorical_categories, sparse_output=False
        )

        ordinal_categories = [
            list(self.config_space[hp_name].sequence) for hp_name in self.ordinal_cols
        ]
        self.ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)

        self.numerical_encoder = StandardScaler()

        # A group without columns contributes an empty (n_samples, 0) matrix.
        if self.categorical_cols:
            X_cat = self.categorical_encoder.fit_transform(df[self.categorical_cols].values)
        else:
            X_cat = np.empty((n_samples, 0))

        if self.ordinal_cols:
            X_ord = self.ordinal_encoder.fit_transform(df[self.ordinal_cols].values)
        else:
            X_ord = np.empty((n_samples, 0))

        if self.numerical_cols:
            X_num = self.numerical_encoder.fit_transform(df[self.numerical_cols].values)
        else:
            X_num = np.empty((n_samples, 0))

        self.n_X_cat = X_cat.shape[1]
        self.n_X_ord = X_ord.shape[1]
        self.n_X_num = X_num.shape[1]

        X = np.hstack([X_cat, X_ord, X_num])

        # A mixture cannot be fitted with more components than rows, so the
        # requested number of components is capped for small dataframes.
        n_components = max(1, min(self.n_components, n_samples))
        self.gmm = GaussianMixture(n_components=n_components, random_state=self.rng)
        self.gmm.fit(X)

    def sample(self, n_samples: int) -> pd.DataFrame:
        """Generates samples from the Gaussian mixture model.

        Args:
            n_samples (int): the number of samples to generate.

        Returns:
            pd.DataFrame: a dataframe with the generated samples. The columns
            are the categorical, then ordinal, then numerical (integer followed
            by float) hyperparameters seen by ``fit``.

        Raises:
            NotFittedError: if ``fit`` was not called before.
        """
        if self.gmm is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This GMMSampler instance is not fitted yet. Call 'fit' before 'sample'."
            )

        X = self.gmm.sample(n_samples)[0]
        n_cat = self.n_X_cat
        n_ord = self.n_X_ord

        # The dataframe is assembled column by column so that every group of
        # hyperparameters keeps its own dtype (stacking the decoded groups in a
        # single array would coerce them to a common dtype).
        columns = {}

        # Categorical
        if n_cat > 0:
            X_cat = self.categorical_encoder.inverse_transform(X[:, :n_cat])
            for i, hp_name in enumerate(self.categorical_cols):
                columns[hp_name] = X_cat[:, i]

        # Ordinal
        if n_ord > 0:
            X_ord = np.array(X[:, n_cat : n_cat + n_ord])
            for i, categories in enumerate(self.ordinal_encoder.categories_):
                # Round to the nearest valid code: plain truncation would bias
                # the samples toward the lower categories.
                X_ord[:, i] = np.clip(np.rint(X_ord[:, i]), 0, len(categories) - 1)
            X_ord = self.ordinal_encoder.inverse_transform(X_ord)
            for i, hp_name in enumerate(self.ordinal_cols):
                columns[hp_name] = X_ord[:, i]

        # Numerical
        if self.n_X_num > 0:
            X_num = self.numerical_encoder.inverse_transform(X[:, n_cat + n_ord :])
            n_int = len(self.integer_cols)
            for i, hp_name in enumerate(self.numerical_cols):
                hp = self.config_space[hp_name]
                values = np.clip(X_num[:, i], hp.lower, hp.upper)
                # The first ``n_int`` numerical columns are the integer ones.
                if i < n_int:
                    values = np.rint(values)
                columns[hp_name] = values

        df = pd.DataFrame(columns, index=pd.RangeIndex(n_samples))

        dtypes = {hp_name: int for hp_name in self.integer_cols}
        dtypes.update({hp_name: float for hp_name in self.float_cols})
        return df.astype(dtypes)