# Source code for deephyper.analysis.hpo

"""Visualization tools for Hyperparameter Optimization.
"""

from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import MaxNLocator

from deephyper.analysis import rank
from deephyper.analysis._paxplot import pax_parallel


def filter_failed_objectives(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split the results of a search into successful and failed evaluations.

    An evaluation is considered failed when at least one of its objective
    values is a string starting with ``"F"``. The objective columns of the
    successful evaluations are converted to ``float``.

    Args:
        df (pd.DataFrame): the results of a Hyperparameter Search.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: ``df_without_failures, df_with_failures`` the first are results of a Hyperparameter Search without failed objectives and the second are results of Hyperparameter search with failed objectives.

    Raises:
        ValueError: if ``df`` has neither an ``"objective"`` column nor ``"objective_<int>"`` columns.
    """
    # Single-Objective
    if "objective" in df.columns:
        objcol = ["objective"]
        # A numeric objective column cannot hold failure markers: the input is
        # returned as is, together with an empty frame of failures.
        if pd.api.types.is_numeric_dtype(df["objective"]):
            return df, df[np.zeros(len(df), dtype=bool)]

    # Multi-Objective
    elif "objective_0" in df.columns:
        objcol = list(df.filter(regex=r"^objective_\d+$").columns)
    else:
        raise ValueError(
            "The DataFrame does not contain neither a column named 'objective' nor columns named 'objective_<int>'."
        )

    # Positional boolean mask of the failed rows. Only genuine strings can be
    # failure markers, so numbers and missing values never count as failures.
    mask = np.zeros(len(df), dtype=bool)
    for col in objcol:
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        failed = df[col].map(lambda v: isinstance(v, str) and v.startswith("F"))
        mask |= failed.to_numpy(dtype=bool)

    df_with_failures = df[mask]

    # ``astype`` returns a new frame: the objective columns really become
    # float (an in-place ``.loc`` assignment would keep the object dtype) and
    # the caller's DataFrame is never written to.
    df_without_failures = df[~mask].astype({col: float for col in objcol})

    return df_without_failures, df_with_failures


def parameters_at_max(
    df: pd.DataFrame, column: str = "objective"
) -> Tuple[dict, float]:
    """Return the parameters at the maximum of the objective function.

    Failed evaluations are removed before looking for the maximum.

    Args:
        df (pd.DataFrame): the results of a Hyperparameter Search.
        column (str, optional): the column to use for the maximization. Defaults to ``"objective"``.

    Returns:
        Tuple[dict, float]: the parameters at the maximum of the ``column`` and its corresponding value. The keys of the dictionary are the names of the ``"p:"`` columns without their prefix.

    Raises:
        ValueError: if no successful evaluation is left after removing the failures.
    """
    df, _ = filter_failed_objectives(df)
    if len(df) == 0:
        raise ValueError(
            "Cannot compute the parameters at the maximum: there is no successful evaluation."
        )

    # ``argmax`` is positional, hence the ``iloc`` lookup.
    best_row = df.iloc[df[column].argmax()]
    value = best_row[column]

    prefix = "p:"
    config = {
        name[len(prefix):]: val
        for name, val in best_row.to_dict().items()
        if name.startswith(prefix)
    }
    return config, value


def plot_search_trajectory_single_objective_hpo(
    results, show_failures: bool = True, ax=None, **kwargs
):
    """Plot the search trajectory of a Single-Objective Hyperparameter Search.

    Successful evaluations are drawn as a scatter plot together with the
    running maximum of the objective. Failed evaluations (objective values
    that are strings starting with ``"F"``) are drawn as red markers slightly
    below the smallest successful objective.

    Args:
        results (pd.DataFrame): the results of a Hyperparameter Search.
        show_failures (bool, optional): whether to show the failed objectives. Defaults to ``True``.
        ax (matplotlib.pyplot.axes): the axes to use for the plot. Defaults to the current axes of the current figure.
        **kwargs: forwarded to ``ax.scatter`` for the successful evaluations, overriding the default marker, size, color and label.

    Returns:
        (matplotlib.pyplot.figure, matplotlib.pyplot.axes): the figure and axes of the plot.
    """
    objective = results["objective"]
    x = np.arange(len(results))

    # Any numeric dtype (not only float64) means that there is no failure.
    is_numeric = pd.api.types.is_numeric_dtype(objective)
    if is_numeric:
        failed = np.zeros(len(results), dtype=bool)
    else:
        failed = objective.map(
            lambda v: isinstance(v, str) and v.startswith("F")
        ).to_numpy(dtype=bool)

    x_success, x_failed = x[~failed], x[failed]

    # Positional selection: the index of ``results`` is not necessarily 0..n-1.
    y_success = objective.iloc[x_success]
    if not is_numeric:
        y_success = y_success.astype(float)

    # Failures are placed 5% of the objective range below the worst success.
    if len(y_success) > 0:
        y_low, y_high = y_success.min(), y_success.max()
        y_failed = y_low - 0.05 * (y_high - y_low)
    else:
        y_failed = 0.0

    scatter_kwargs = dict(marker="o", s=10, c="skyblue", label="Successes")
    scatter_kwargs.update(kwargs)

    # When axes are given, return the figure they belong to rather than
    # whatever figure happens to be current.
    if ax is None:
        fig = plt.gcf()
        ax = fig.gca()
    else:
        fig = ax.figure

    ax.plot(x_success, y_success.cummax())
    ax.scatter(x_success, y_success, **scatter_kwargs)

    if show_failures and len(x_failed) > 0:
        ax.scatter(
            x_failed,
            # Float array: ``np.full_like(x_failed, ...)`` would inherit the
            # integer dtype of ``x_failed`` and truncate the value.
            np.full(len(x_failed), y_failed, dtype=float),
            marker="v",
            color="red",
            label="Failures",
        )

    ax.set_xlabel("Evaluations")
    ax.set_ylabel("Objective")
    ax.legend()
    ax.grid(True)
    if len(x) > 0:
        ax.set_xlim(x.min(), x.max())

    return fig, ax


def compile_worker_activity(results, profile_type="submit/gather"):
    """Compute the number of active workers.

    Each evaluation contributes a ``+1`` event at its start timestamp and a
    ``-1`` event at its end timestamp. The events are sorted by time and
    accumulated.

    Args:
        results (pd.DataFrame): the results of a Hyperparameter Search.
        profile_type (str, optional): the type of profile to build. It can be `"submit/gather"` or `"start/end"`. Defaults to "submit/gather".

    Returns:
        timestamps, n_jobs_active: two ``numpy`` arrays of length ``2 * len(results) + 1``, the timestamps and the number of active jobs at each timestamp. Both start with an initial ``0`` entry.

    Raises:
        ValueError: if ``profile_type`` is unknown or if the corresponding timestamp columns are missing from ``results``.
    """
    if profile_type == "submit/gather":
        key_start, key_end = "m:timestamp_submit", "m:timestamp_gather"
    elif profile_type == "start/end":
        key_start, key_end = "m:timestamp_start", "m:timestamp_end"
    else:
        raise ValueError(
            f"Unknown profile_type='{profile_type}' it should be one of ['submit/gather', 'start/end']."
        )

    if key_start not in results.columns or key_end not in results.columns:
        raise ValueError(
            f"Columns '{key_start}' and '{key_end}' are not present in the DataFrame."
        )

    results = results.sort_values(by=[key_start], ascending=True)

    # Interleave the events as start_0, end_0, start_1, end_1, ... so that the
    # stable sort below breaks ties between equal timestamps in this order.
    times = results[[key_start, key_end]].to_numpy(dtype=float).ravel()
    increments = np.tile([1, -1], len(results))
    order = np.argsort(times, kind="stable")

    # Prepend the initial state: no active job at time 0.
    timestamp = np.concatenate(([0.0], times[order]))
    n_jobs_running = np.concatenate(([0.0], np.cumsum(increments[order])))

    return timestamp, n_jobs_running


def plot_worker_utilization(
    results,
    num_workers: int = None,
    profile_type: str = "submit/gather",
    ax=None,
    **kwargs,
):
    """Plot the worker utilization of a search.

    Args:
        results (pd.DataFrame): the results of a Hyperparameter Search.
        num_workers (int, optional): the number of workers. If passed the normalized utilization will be shown (/num_workers). Otherwise, the raw number of active workers is shown. Defaults to ``None``.
        profile_type (str, optional): the type of profile to build. It can be `"submit/gather"` or `"start/end"`. Defaults to "submit/gather".
        ax (matplotlib.pyplot.axes): the axes to use for the plot. Defaults to the current axes of the current figure.
        **kwargs: forwarded to ``ax.plot``.

    Returns:
        (matplotlib.pyplot.figure, matplotlib.pyplot.axes): the figure and axes of the plot.
    """

    x, y = compile_worker_activity(results, profile_type=profile_type)

    # A falsy ``num_workers`` (``None`` or ``0``) keeps the raw counts.
    if num_workers:
        y = y / num_workers

    # When axes are given, return the figure they belong to rather than
    # whatever figure happens to be current.
    if ax is None:
        fig = plt.gcf()
        ax = fig.gca()
    else:
        fig = ax.figure

    ax.plot(x, y, **kwargs)

    ax.set_xlabel("Time (sec.)")
    ax.set_ylabel("Utilization" if num_workers else "Active Workers")

    # Only draw a legend when there is something to show in it (e.g. a
    # ``label`` was passed through ``kwargs``); otherwise matplotlib warns.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()

    ax.grid(True)
    ax.set_xlim(x.min(), x.max())

    return fig, ax


def add_colorbar_px(paxfig, data, cmap="viridis", colorbar_kwargs=None):
    """Color the lines of a parallel-coordinate figure and add a colorbar.

    The ``i``-th line of each axis (except the last one) is colored according
    to ``data[i]``, linearly mapped between ``data.min()`` and ``data.max()``.

    Args:
        paxfig: the figure holding the parallel-coordinate axes, such as the one built with ``pax_parallel`` in ``plot_parallel_coordinate``.
        data (np.ndarray): the values used to color the lines, one per line.
        cmap (str or matplotlib.colors.Colormap, optional): the colormap, or the name of a registered colormap. Defaults to ``"viridis"``.
        colorbar_kwargs (dict, optional): keyword arguments forwarded to ``paxfig.colorbar``. Defaults to ``None``.

    Returns:
        the input ``paxfig``.

    Raises:
        ValueError: if ``data`` is empty.
    """
    colorbar_kwargs = {} if colorbar_kwargs is None else colorbar_kwargs

    # Positional access below: do not depend on the index of a pandas Series.
    data = np.asarray(data)
    if data.size == 0:
        raise ValueError("Cannot add a colorbar: 'data' is empty.")

    # Accept a colormap name as well as a colormap object.
    cmap = plt.get_cmap(cmap)

    # Attribute
    paxfig._pax_colorbar = True

    # Local vars
    n_lines = len(paxfig.axes[0].lines)
    n_axes = len(paxfig.axes)

    vmin = data.min()
    vmax = data.max()
    span = vmax - vmin

    # Change line colors
    for i in range(n_lines):
        # Position of the value in [0, 1]; constant data maps to 0 instead of
        # dividing by zero.
        color = (data[i] - vmin) / span if span > 0 else 0.0
        # Assign color to line
        for j in paxfig.axes[:-1]:
            j.lines[i].set_color(cmap(color))

    # Create blank axis for colorbar
    width_ratios = paxfig.axes[0].get_gridspec().get_width_ratios()
    new_n_axes = n_axes + 1
    # ``list`` makes sure that ``+`` appends whatever sequence type is returned.
    new_width_ratios = list(width_ratios) + [0.5]
    gs = paxfig.add_gridspec(1, new_n_axes, width_ratios=new_width_ratios)
    ax_colorbar = paxfig.add_subplot(gs[0, n_axes])

    # Create colorbar
    sm = plt.cm.ScalarMappable(norm=plt.Normalize(vmin=vmin, vmax=vmax), cmap=cmap)
    cbar = paxfig.colorbar(
        sm, orientation="vertical", ax=ax_colorbar, **colorbar_kwargs
    )
    cbar.locator = MaxNLocator(integer=True)
    cbar.update_ticks()

    main_ax_pos = paxfig.axes[-1].get_position()
    cbar_ax = cbar.ax
    cbar_ax.set_position(
        [
            main_ax_pos.x1 + 0.03,  # X position (left)
            main_ax_pos.y0,  # Y position (bottom)
            0.02,  # Width of colorbar
            main_ax_pos.height,  # Height of colorbar
        ]
    )

    # Figure formatting
    for i in range(n_axes):
        paxfig.axes[i].set_subplotspec(gs[0:1, i : i + 1])
    ax_colorbar.set_axis_off()
    return paxfig


def plot_parallel_coordinate(
    results,
    parameters_columns=None,
    objective_column="objective",
    rank_mode="min",
    highlight=True,
    constant_predictor=0.035726056,
):
    """Plot a parallel coordinate plot of the hyperparameters.

    Args:
        results (pd.DataFrame): the results of a Hyperparameter Search.
        parameters_columns (list, optional): list of columns to include in the plot. Defaults to all the columns starting with ``"p:"``.
        objective_column (str): name of the objective column
        rank_mode (str, optional): mode of ranking, ``"min"`` or ``"max"``. Only used when ``highlight`` is ``True``. Defaults to "min".
        highlight (bool, optional): whether to highlight the best solutions. Defaults to True.
        constant_predictor (float, optional): value to compare the objective to. Defaults to 0.035726056.

    Returns:
        (fig, ax): the current matplotlib figure and its current axes once the plot is built.

    Raises:
        ValueError: if ``highlight`` is ``True`` and ``rank_mode`` is neither ``"min"`` nor ``"max"``.
    """
    import warnings

    if highlight and rank_mode not in ("min", "max"):
        raise ValueError(
            f"Unknown rank_mode='{rank_mode}' it should be one of ['min', 'max']."
        )

    # Build a new list: the caller's ``parameters_columns`` must not be modified.
    if parameters_columns is None:
        cols = [c for c in results.columns if c.startswith("p:")]
    else:
        cols = list(parameters_columns)
    cols.append(objective_column)

    results, _ = filter_failed_objectives(results.copy())
    results = results[cols].copy()

    # Boolean columns are plotted as the strings "True"/"False".
    for col in results.columns:
        if results[col].dtype == bool:
            results[col] = results[col].astype(str)

    # NOTE: these settings are global, they stay active after the call.
    plt.rcParams.update(
        {
            "font.family": "Arial",
            "xtick.labelsize": 12,  # Set the X-axis tick label font size
            "ytick.labelsize": 12,
            "font.size": 16,
            "axes.linewidth": 1,
            "font.weight": "bold",
            "axes.labelweight": "bold",
            "axes.titleweight": "bold",
        }
    )

    cmap = LinearSegmentedColormap.from_list(
        "my_gradient",
        (
            # Edit this gradient at https://eltos.github.io/gradient/#0:00D0FF-33.3:0000FF-66.7:FF0000-100:FFD800
            (0.000, (0.000, 0.816, 1.000)),
            (0.333, (0.000, 0.000, 1.000)),
            (0.667, (1.000, 0.000, 0.000)),
            (1.000, (1.000, 0.847, 0.000)),
        ),
    )

    # Axis labels, e.g. "p:learning_rate" -> "Learning Rate". They are kept
    # apart from ``cols``, which still holds the real column names.
    labels = [
        " ".join(
            word.capitalize()
            for word in c.replace("p:", "").replace("_", " ").split()
        )
        for c in cols
    ]

    paxfig = pax_parallel(n_axes=len(cols))

    if highlight:
        # Split data between the solutions better than the constant predictor
        # (highlighted and ranked) and the others (greyed out).
        objective = results[objective_column]
        if rank_mode == "min":
            df_highlight = results[objective < constant_predictor].copy()
            df_highlight["rank"] = rank(df_highlight[objective_column])
            df_grey = results[objective >= constant_predictor]
        else:
            df_highlight = results[objective > constant_predictor].copy()
            df_highlight["rank"] = rank(-df_highlight[objective_column])
            df_grey = results[objective <= constant_predictor]

        # The last column ("rank") is only used for the colors.
        paxfig.plot(df_highlight.to_numpy()[:, :-1], line_kwargs={"alpha": 0.5})

        paxfig.set_labels(labels)

        paxfig = add_colorbar_px(
            paxfig=paxfig,
            data=df_highlight["rank"].to_numpy(),
            cmap=cmap,
            colorbar_kwargs={"label": "Rank"},
        )

        # The greyed-out lines are best effort: a failure is reported but does
        # not prevent returning the highlighted plot.
        if len(df_grey) > 0:
            try:
                paxfig.plot(
                    df_grey.to_numpy(),
                    line_kwargs={"alpha": 0.1, "color": "grey", "zorder": 0},
                )
            except Exception as exc:
                warnings.warn(
                    f"Could not plot the non-highlighted solutions: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )

    else:
        # ``results`` is already restricted to ``cols``.
        paxfig.plot(results.to_numpy(), line_kwargs={"alpha": 0.5})

        paxfig.set_labels(labels)

        # Add colorbar
        # NOTE(review): the colorbar is attached to the objective axis while
        # its label reads "Rank" - confirm the intended label.
        color_col = len(cols) - 1
        paxfig.add_colorbar(
            ax_idx=color_col, cmap=cmap, colorbar_kwargs={"label": "Rank"}
        )

    fig = plt.gcf()
    ax = fig.gca()

    return fig, ax