Source code for deephyper.ensemble.aggregator._mode

from typing import List, Optional, Union, Dict

import numpy as np

from deephyper.ensemble.aggregator._aggregator import Aggregator



[docs]
class ModeAggregator(Aggregator):
    """Aggregate predictions using the mode of categorical distributions from predictors.

    .. list-table::
        :widths: 25 25
        :header-rows: 1

        * - Array (Fixed Set)
          - MaskedArray
        * - ✅
          - ✅

    This aggregator is useful when the ensemble is composed of predictors that output categorical
    distributions. The mode of the ensemble is the mode of the modes of the predictors, minimizing
    the 0-1 loss.

    Args:
        with_uncertainty (bool, optional): a boolean that sets if the uncertainty should be
            returned when calling the aggregator. Defaults to ``False``.
    """

    def __init__(self, with_uncertainty: bool = False):
        self.with_uncertainty = with_uncertainty

    def aggregate(
        self,
        y: List[Union[np.ndarray, np.ma.MaskedArray]],
        weights: Optional[List[float]] = None,
    ) -> Union[
        Union[np.ndarray, np.ma.MaskedArray],
        Dict[str, Union[np.ndarray, np.ma.MaskedArray]],
    ]:
        """Aggregate predictions using the mode of categorical distributions.

        Each predictor votes for its most probable class (argmax over the last axis). Votes are
        accumulated per class, and the class with the largest accumulated vote is returned.

        Args:
            y (List[Union[np.ndarray, np.ma.MaskedArray]]): List of ``n_predictors`` categorical
                probability arrays, each of shape ``(n_samples, ..., n_classes)``. They are
                stacked into an array of shape ``(n_predictors, n_samples, ..., n_classes)``.
            weights (Optional[List[float]]): One weight per predictor. When ``None`` every
                predictor's vote is weighted by ``1 / n_predictors``. Weights are used as given
                (they are not normalized). Default is ``None``.

        Returns:
            Union[Union[np.ndarray, np.ma.MaskedArray], Dict[str, Union[np.ndarray,
            np.ma.MaskedArray]]]: Aggregated results, as an array corresponding to the mode when
            ``with_uncertainty=False`` and as a dict otherwise including:

            - ``"loc"``: Aggregated mode of shape ``(n_samples, ...)``.
            - ``"uncertainty"``: One minus the largest accumulated vote, of shape
              ``(n_samples, ...)``.

        Raises:
            TypeError: If `y` is not a list of ``numpy.ndarray`` or ``numpy.ma.MaskedArray``.
            ValueError: If `y` is empty or if `weights` length does not match `y`.
        """
        if not isinstance(y, list) or not all(
            isinstance(arr, (np.ndarray, np.ma.MaskedArray)) for arr in y
        ):
            raise TypeError("Input `y` must be a list of numpy.ndarray or numpy.ma.MaskedArray.")

        if not y:
            raise ValueError("Input `y` must contain at least one array.")

        # Fail early with the documented exception instead of an IndexError in the vote loop
        # (too few weights) or silently ignoring trailing weights (too many).
        if weights is not None and len(weights) != len(y):
            raise ValueError(
                f"The length of `weights` ({len(weights)}) must match the number of predictors "
                f"in `y` ({len(y)})."
            )

        # The masked-array code path is used only when every prediction is a MaskedArray.
        is_masked = all(isinstance(pred, np.ma.MaskedArray) for pred in y)
        self._np = np.ma if is_masked else np

        # Categorical probabilities (n_predictors, n_samples, ..., n_classes)
        y_proba_models = self._np.stack(y, axis=0)
        n_predictors = y_proba_models.shape[0]
        num_classes = y_proba_models.shape[-1]

        # Mode of each predictor (n_predictors, n_samples, ...)
        # NOTE(review): for masked inputs, a predictor whose class row is entirely masked
        # appears to still cast a vote (for class 0) through this argmax -- confirm whether
        # such predictors should be excluded from the vote.
        y_mode_models = self._np.argmax(y_proba_models, axis=-1)

        # Vote accumulator (n_samples, ..., n_classes). Built from the stacked array so that, in
        # the masked case, it is a masked array whose mask can be read back below.
        weighted_counts = self._np.zeros_like(y_proba_models, dtype=np.float64).sum(axis=0)

        # Row ``k`` of the identity matrix is the one-hot encoding of class ``k``.
        eye_arr = np.eye(num_classes, dtype=np.float64)
        for idx in range(n_predictors):
            one_hot_votes = eye_arr[y_mode_models[idx]]
            if weights is None:
                weighted_counts += one_hot_votes / n_predictors
            else:
                weighted_counts += one_hot_votes * weights[idx]

        # Mode of the ensemble (n_samples, ...)
        y_mode_ensemble = weighted_counts.argmax(axis=-1)
        if is_masked:
            # Re-attach a mask to the ensemble mode, taken from the class-wise sum of the votes.
            mask = weighted_counts.sum(axis=-1).mask
            y_mode_ensemble = self._np.array(y_mode_ensemble, mask=mask)

        if not self.with_uncertainty:
            return y_mode_ensemble

        # Uncertainty of ensemble: one minus the accumulated vote of the winning class.
        uncertainty = 1 - self._np.max(weighted_counts, axis=-1)

        return {
            "loc": y_mode_ensemble,
            "uncertainty": uncertainty,
        }