# Source code for deephyper.ensemble._bagging_ensemble

import os
import traceback

import tensorflow as tf
import numpy as np
import ray

from deephyper.nas.metrics import selectMetric
from deephyper.ensemble import BaseEnsemble
from deephyper.nas.run._util import set_memory_growth_for_visible_gpus


def mse(y_true, y_pred):
    """Element-wise squared error between true and predicted values."""
    error = y_true - y_pred
    return tf.square(error)


@ray.remote(num_cpus=1)
def model_predict(model_path, X, batch_size=32, verbose=0):
    """Perform an inference of the model located at ``model_path``.

    :meta private:

    Args:
        model_path (str): Path to the ``h5`` file to load to perform the inference.
        X (array): array of input data for which we perform the inference.
        batch_size (int, optional): Batch size used to perform the inference. Defaults to 32.
        verbose (int, optional): Verbose option. Defaults to 0.

    Returns:
        array: The prediction based on the provided input data, or ``None``
        if the model could not be loaded.
    """

    # GPU Configuration if available
    set_memory_growth_for_visible_gpus(True)
    tf.keras.backend.clear_session()
    # os.path.basename is portable; the original split("/") assumed POSIX separators.
    model_file = os.path.basename(model_path)

    try:
        if verbose:
            print(f"Loading model {model_file}", flush=True)
        model = tf.keras.models.load_model(model_path, compile=False)
    except Exception:
        # Best-effort: a model that fails to load is reported (when verbose)
        # and skipped by the caller, which filters out None results.
        if verbose:
            print(f"Could not load model {model_file}", flush=True)
            traceback.print_exc()
        return None

    return model.predict(X, batch_size=batch_size)


class BaggingEnsemble(BaseEnsemble):
    """Ensemble based on uniform averaging of the predictions of each member.

    :meta private:

    Args:
        model_dir (str): Path to directory containing saved Keras models in .h5 format.
        loss (callable): a callable taking (y_true, y_pred) as input.
        size (int, optional): Number of unique models used in the ensemble. Defaults to 5.
        verbose (bool, optional): Verbose mode. Defaults to True.
        ray_address (str, optional): Address of the Ray cluster. If "auto" it will try to connect to an existing cluster. If "" it will start a local Ray cluster. Defaults to "".
        num_cpus (int, optional): Number of CPUs allocated to load one model and predict. Defaults to 1.
        num_gpus (int, optional): Number of GPUs allocated to load one model and predict. Defaults to None.
        batch_size (int, optional): Batch size used batchify the inference of loaded models. Defaults to 32.
        selection (str, optional): Selection strategy to build the ensemble. Value in ``["topk"]``. Default to ``topk``.
        mode (str, optional): Value in ``["regression", "classification"]``. Default to ``"regression"``.

    Raises:
        ValueError: if ``selection`` or ``mode`` is not a supported value.
    """

    def __init__(
        self,
        model_dir,
        loss=mse,
        size=5,
        verbose=True,
        ray_address="",
        num_cpus=1,
        num_gpus=None,
        batch_size=32,
        selection="topk",
        mode="regression",
    ):
        super().__init__(
            model_dir,
            loss,
            size,
            verbose,
            ray_address,
            num_cpus,
            num_gpus,
            batch_size,
        )
        # Explicit raises instead of assert: asserts are stripped under `python -O`.
        if selection not in ["topk"]:
            raise ValueError(f"selection must be 'topk' but got {selection!r}")
        self.selection = selection
        if mode not in ["regression", "classification"]:
            raise ValueError(
                f"mode must be 'regression' or 'classification' but got {mode!r}"
            )
        self.mode = mode

    def __repr__(self) -> str:
        out = super().__repr__()
        out += f"Mode: {self.mode}\n"
        out += f"Selection: {self.selection}\n"
        return out

    def _model_path(self, model_file):
        """Absolute path of ``model_file`` inside ``self.model_dir``."""
        return os.path.join(self.model_dir, model_file)

    def _predict_models(self, model_files, X_id):
        """Run remote inference of every model in ``model_files`` on the data ``X_id``.

        Returns the raw list of results, one per file, in order; entries are
        ``None`` for models that failed to load.
        """
        return ray.get(
            [
                model_predict.options(
                    num_cpus=self.num_cpus, num_gpus=self.num_gpus
                ).remote(self._model_path(f), X_id, self.batch_size, self.verbose)
                for f in model_files
            ]
        )

    def fit(self, X, y):
        """Fit the current algorithm to the provided data.

        Args:
            X (array): The input data.
            y (array): The output data.

        Returns:
            BaseEnsemble: The current fitted instance.
        """
        X_id = ray.put(X)
        model_files = self._list_files_in_model_dir()

        y_pred = self._predict_models(model_files, X_id)
        # Keep file names aligned with their predictions: a model that failed
        # to load yields None and must be dropped from BOTH lists, otherwise
        # the indexes returned by topk() would point at the wrong files.
        loaded = [(f, p) for f, p in zip(model_files, y_pred) if p is not None]
        model_files = [f for f, _ in loaded]
        y_pred = np.array([p for _, p in loaded])

        members_indexes = topk(self.loss, y_true=y, y_pred=y_pred, k=self.size)
        self.members_files = [model_files[i] for i in members_indexes]

        return self

    def predict(self, X) -> np.ndarray:
        """Execute an inference of the ensemble for the provided data.

        Args:
            X (array): An array of input data.

        Returns:
            array: The prediction.
        """
        X_id = ray.put(X)

        y_pred = self._predict_models(self.members_files, X_id)
        y_pred = np.array([p for p in y_pred if p is not None])

        y = aggregate_predictions(y_pred, regression=(self.mode == "regression"))

        return y

    def evaluate(self, X, y, metrics=None):
        """Compute metrics based on the provided data.

        Args:
            X (array): An array of input data.
            y (array): An array of true output data.
            metrics (callable, optional): A metric. Defaults to None.

        Returns:
            dict: maps "loss" (and each metric name) to its scalar value.
        """
        scores = {}

        y_pred = self.predict(X)

        scores["loss"] = tf.reduce_mean(self.loss(y, y_pred)).numpy()
        if metrics:
            for metric_name in metrics:
                scores[metric_name] = apply_metric(metric_name, y, y_pred)

        return scores


class BaggingEnsembleRegressor(BaggingEnsemble):
    """Ensemble for regression based on uniform averaging of the predictions of each member.

    Args:
        model_dir (str): Path to directory containing saved Keras models in .h5 format.
        loss (callable): a callable taking (y_true, y_pred) as input.
        size (int, optional): Number of unique models used in the ensemble. Defaults to 5.
        verbose (bool, optional): Verbose mode. Defaults to True.
        ray_address (str, optional): Address of the Ray cluster. If "auto" it will try to connect to an existing cluster. If "" it will start a local Ray cluster. Defaults to "".
        num_cpus (int, optional): Number of CPUs allocated to load one model and predict. Defaults to 1.
        num_gpus (int, optional): Number of GPUs allocated to load one model and predict. Defaults to None.
        selection (str, optional): Selection strategy to build the ensemble. Value in ``["topk"]``. Default to ``topk``.
        batch_size (int, optional): Batch size used batchify the inference of loaded models. Defaults to 32.
    """

    def __init__(
        self,
        model_dir,
        loss=mse,
        size=5,
        verbose=True,
        ray_address="",
        num_cpus=1,
        num_gpus=None,
        selection="topk",
        batch_size=32,
    ):
        # Keyword arguments are required here: the original positional call
        # bound ``selection`` to the parent's ``batch_size`` parameter.
        super().__init__(
            model_dir,
            loss=loss,
            size=size,
            verbose=verbose,
            ray_address=ray_address,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            batch_size=batch_size,
            selection=selection,
            mode="regression",
        )
class BaggingEnsembleClassifier(BaggingEnsemble):
    """Ensemble for classification based on uniform averaging of the predictions of each member.

    Args:
        model_dir (str): Path to directory containing saved Keras models in .h5 format.
        loss (callable): a callable taking (y_true, y_pred) as input.
        size (int, optional): Number of unique models used in the ensemble. Defaults to 5.
        verbose (bool, optional): Verbose mode. Defaults to True.
        ray_address (str, optional): Address of the Ray cluster. If "auto" it will try to connect to an existing cluster. If "" it will start a local Ray cluster. Defaults to "".
        num_cpus (int, optional): Number of CPUs allocated to load one model and predict. Defaults to 1.
        num_gpus (int, optional): Number of GPUs allocated to load one model and predict. Defaults to None.
        selection (str, optional): Selection strategy to build the ensemble. Value in ``["topk"]``. Default to ``topk``.
        batch_size (int, optional): Batch size used batchify the inference of loaded models. Defaults to 32.
    """

    def __init__(
        self,
        model_dir,
        loss=mse,
        size=5,
        verbose=True,
        ray_address="",
        num_cpus=1,
        num_gpus=None,
        selection="topk",
        batch_size=32,
    ):
        # Keyword arguments are required here: the original positional call
        # bound ``selection`` to the parent's ``batch_size`` parameter.
        super().__init__(
            model_dir,
            loss=loss,
            size=size,
            verbose=verbose,
            ray_address=ray_address,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            batch_size=batch_size,
            selection=selection,
            mode="classification",
        )
def apply_metric(metric_name, y_true, y_pred) -> float:
    """Perform the computation of provided metric.

    :meta private:

    Args:
        metric_name (str|callable): If ``str`` then it needs to be a metric available in ``deephyper.nas.metrics``.
        y_true (array): Array of true predictions.
        y_pred (array): Array of predicted predictions.

    Returns:
        float: a scalar value of the computed metric.
    """
    metric_func = selectMetric(metric_name)
    metric = tf.reduce_mean(
        metric_func(
            tf.convert_to_tensor(y_true, dtype=np.float32),
            tf.convert_to_tensor(y_pred, dtype=np.float32),
        )
    ).numpy()
    return metric


def aggregate_predictions(y_pred, regression=True):
    """Aggregate the predictions of several models into an ensemble prediction.

    :meta private:

    Args:
        y_pred (np.array): Predictions array of shape (n_models, n_samples, n_outputs).
        regression (bool): Boolean (True) if it is a regression (False) if it is a classification.

    Return:
        np.array: the average of the member predictions, shape
        (n_samples, n_outputs), for regression; or the per-sample index of
        the class with the highest summed score, shape (n_samples,), for
        classification.
    """
    n_models = np.shape(y_pred)[0]
    summed = np.sum(y_pred, axis=0)
    if regression:
        return summed / n_models
    # classification: argmax over the class axis of the summed scores
    return np.argmax(summed, axis=1)


def topk(loss_func, y_true, y_pred, k=2) -> list:
    """Select the Top-k models to be part of the ensemble. A model can appear
    only once in the ensemble for this strategy.

    :meta private:

    Args:
        loss_func (callable): loss function.
        y_true (array): Array of true predictions.
        y_pred (array): Array of predicted predictions.
        k (int, optional): Number of models composing the ensemble. Defaults to 2.

    Returns:
        list: a list of model indexes composing the ensembles.
    """
    # losses is of shape: (n_models, n_outputs)
    losses = tf.reduce_mean(loss_func(y_true, y_pred), axis=1).numpy()
    ensemble_members = np.argsort(losses, axis=0)[:k].reshape(-1).tolist()
    return ensemble_members