Source code for selene_sdk.utils.performance_metrics

"""
This module provides the `PerformanceMetrics` class and supporting
functionality for tracking and computing model performance.
"""
from collections import defaultdict, namedtuple
import logging
import os

import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from scipy.stats import rankdata


logger = logging.getLogger("selene")


Metric = namedtuple("Metric", ["fn", "data"])
"""
A tuple containing a metric function and the results from applying that
metric to some values.

Parameters
----------
fn : types.FunctionType
    A metric.
data : list(float)
    A list holding the results from applying the metric.

Attributes
----------
fn : types.FunctionType
    A metric.
data : list(float)
    A list holding the results from applying the metric.

"""


def visualize_roc_curves(prediction,
                         target,
                         output_dir,
                         report_gt_feature_n_positives=50,
                         style="seaborn-colorblind",
                         fig_title="Feature ROC curves",
                         dpi=500):
    """
    Output the ROC curves for each feature predicted by a model
    as an SVG.

    Parameters
    ----------
    prediction : numpy.ndarray
        Value predicted by user model.
    target : numpy.ndarray
        True value that the user model was trying to predict.
    output_dir : str
        The path to the directory to output the figures. Directories that
        do not currently exist will be automatically created.
    report_gt_feature_n_positives : int, optional
        Default is 50. Do not visualize an ROC curve for a feature with
        fewer than 50 positive examples in `target`.
    style : str, optional
        Default is "seaborn-colorblind". Specify a style available in
        `matplotlib.pyplot.style.available` to use.
    fig_title : str, optional
        Default is "Feature ROC curves". Set the figure title.
    dpi : int, optional
        Default is 500. Specify dots per inch (resolution) of the figure.

    Returns
    -------
    None
        Outputs the figure in `output_dir`.

    """
    os.makedirs(output_dir, exist_ok=True)

    import matplotlib
    backend = matplotlib.get_backend()
    if "inline" not in backend:
        matplotlib.use("SVG")
    import matplotlib.pyplot as plt

    plt.style.use(style)
    plt.figure()
    for index, feature_preds in enumerate(prediction.T):
        feature_targets = target[:, index]
        if len(np.unique(feature_targets)) > 1 and \
                np.sum(feature_targets) > report_gt_feature_n_positives:
            fpr, tpr, _ = roc_curve(feature_targets, feature_preds)
            plt.plot(fpr, tpr, 'r-', color="black", alpha=0.3, lw=1)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if fig_title:
        plt.title(fig_title)
    plt.savefig(os.path.join(output_dir, "roc_curves.svg"),
                format="svg",
                dpi=dpi)

def visualize_precision_recall_curves(
        prediction,
        target,
        output_dir,
        report_gt_feature_n_positives=50,
        style="seaborn-colorblind",
        fig_title="Feature precision-recall curves",
        dpi=500):
    """
    Output the precision-recall (PR) curves for each feature predicted
    by a model as an SVG.

    Parameters
    ----------
    prediction : numpy.ndarray
        Value predicted by user model.
    target : numpy.ndarray
        True value that the user model was trying to predict.
    output_dir : str
        The path to the directory to output the figures. Directories that
        do not currently exist will be automatically created.
    report_gt_feature_n_positives : int, optional
        Default is 50. Do not visualize a PR curve for a feature with
        fewer than 50 positive examples in `target`.
    style : str, optional
        Default is "seaborn-colorblind". Specify a style available in
        `matplotlib.pyplot.style.available` to use.
    fig_title : str, optional
        Default is "Feature precision-recall curves". Set the figure title.
    dpi : int, optional
        Default is 500. Specify dots per inch (resolution) of the figure.

    Returns
    -------
    None
        Outputs the figure in `output_dir`.

    """
    os.makedirs(output_dir, exist_ok=True)

    # TODO: fix this
    import matplotlib
    backend = matplotlib.get_backend()
    if "inline" not in backend:
        matplotlib.use("SVG")
    import matplotlib.pyplot as plt

    plt.style.use(style)
    plt.figure()
    for index, feature_preds in enumerate(prediction.T):
        feature_targets = target[:, index]
        if len(np.unique(feature_targets)) > 1 and \
                np.sum(feature_targets) > report_gt_feature_n_positives:
            precision, recall, _ = precision_recall_curve(
                feature_targets, feature_preds)
            plt.step(
                recall, precision, 'r-',
                color="black", alpha=0.3, lw=1, where="post")

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    if fig_title:
        plt.title(fig_title)
    plt.savefig(os.path.join(output_dir, "precision_recall_curves.svg"),
                format="svg",
                dpi=dpi)

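# Illustrative usage sketch only (not part of the selene_sdk API). The array
# shapes, random data, and output directory below are assumptions made for
# the example; any directory path and feature count would work the same way.
def _example_visualize_curves(output_dir="./example_figures"):
    """Plot ROC and PR curves for randomly generated predictions."""
    rng = np.random.RandomState(0)
    # 1000 examples, 4 binary features; "predictions" are noisy scores that
    # correlate with the targets so the curves are non-trivial.
    target = rng.binomial(1, 0.3, size=(1000, 4))
    prediction = np.clip(target * 0.6 + rng.rand(1000, 4) * 0.4, 0., 1.)
    visualize_roc_curves(prediction, target, output_dir,
                         report_gt_feature_n_positives=50)
    visualize_precision_recall_curves(prediction, target, output_dir,
                                      report_gt_feature_n_positives=50)
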
def compute_score(prediction, target, metric_fn,
                  report_gt_feature_n_positives=10):
    """
    Using a user-specified metric, computes the distance between
    two tensors.

    Parameters
    ----------
    prediction : numpy.ndarray
        Value predicted by user model.
    target : numpy.ndarray
        True value that the user model was trying to predict.
    metric_fn : types.FunctionType
        A metric that can measure the distance between the prediction
        and target variables.
    report_gt_feature_n_positives : int, optional
        Default is 10. The minimum number of positive examples for a
        feature in order to compute the score for it.

    Returns
    -------
    average_score, feature_scores : tuple(float, numpy.ndarray)
        A tuple containing the average of all feature scores, and a
        vector containing the scores for each feature. If no features
        met the filtering thresholds, the average score will be `None`
        and the feature scores vector will contain only `NaN` values.

    """
    feature_scores = np.ones(target.shape[1]) * np.nan
    # Handle multi-class classification, where each example has a single
    # target value but multiple prediction values.
    if target.shape[1] == 1 and prediction.shape[1] > 1:
        prediction = [prediction]
    else:
        prediction = prediction.T
    for index, feature_preds in enumerate(prediction):
        feature_targets = target[:, index]
        if len(np.unique(feature_targets)) > 0 and \
                np.count_nonzero(feature_targets) > report_gt_feature_n_positives:
            try:
                feature_scores[index] = metric_fn(
                    feature_targets, feature_preds)
            except ValueError:
                # Some metrics raise a ValueError on degenerate inputs
                # (e.g. only one class present); skip those features.
                continue
    valid_feature_scores = [s for s in feature_scores if not np.isnan(s)]
    # Allow 0 or negative values.
    if not valid_feature_scores:
        return None, feature_scores
    average_score = np.average(valid_feature_scores)
    return average_score, feature_scores


def get_feature_specific_scores(data, get_feature_from_index_fn):
    """
    Generates a dictionary mapping feature names to feature scores from
    an intermediate representation.

    Parameters
    ----------
    data : list(float) or numpy.ndarray
        The scores, where position `i` holds the score for the feature
        with index `i`.
    get_feature_from_index_fn : types.FunctionType
        A function that takes an index (`int`) and returns a feature
        name (`str`).

    Returns
    -------
    dict
        A dictionary mapping feature names (`str`) to scores (`float`).
        If there was no score for a feature, its score will be set to
        `None`.

    """
    feature_score_dict = {}
    for index, score in enumerate(data):
        feature = get_feature_from_index_fn(index)
        if not np.isnan(score):
            feature_score_dict[feature] = score
        else:
            feature_score_dict[feature] = None
    return feature_score_dict


def auc_u_test(labels, predictions):
    """
    Outputs the area under the ROC curve associated with a certain set
    of labels and the predictions given by the training model.
    Computed from the U statistic.

    Parameters
    ----------
    labels : numpy.ndarray
        Known labels of values predicted by model. Must be one dimensional.
    predictions : numpy.ndarray
        Value predicted by user model. Must be one dimensional, with the
        same length as `labels`.

    Returns
    -------
    float
        AUC value of given label, prediction pairs.

    """
    len_pos = int(np.sum(labels))
    len_neg = len(labels) - len_pos
    rank_sum = np.sum(rankdata(predictions)[labels == 1])
    u_value = rank_sum - (len_pos * (len_pos + 1)) / 2
    auc = u_value / (len_pos * len_neg)
    return auc

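# Illustrative check only (not part of the selene_sdk API): `compute_score`
# averages a per-feature metric, and the rank-based `auc_u_test` should agree
# with `sklearn.metrics.roc_auc_score`. The data below are randomly generated
# for the example.
def _example_compute_score_and_auc():
    rng = np.random.RandomState(0)
    target = rng.binomial(1, 0.3, size=(500, 3))
    prediction = np.clip(target * 0.5 + rng.rand(500, 3) * 0.5, 0., 1.)

    # Per-feature ROC AUCs and their average.
    avg_roc_auc, per_feature = compute_score(
        prediction, target, roc_auc_score, report_gt_feature_n_positives=10)
    print("average ROC AUC: {0:.4f}".format(avg_roc_auc))
    print("per-feature ROC AUC:", per_feature)

    # The U-statistic AUC matches sklearn's implementation on a single feature.
    assert np.isclose(auc_u_test(target[:, 0], prediction[:, 0]),
                      roc_auc_score(target[:, 0], prediction[:, 0]))
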
class PerformanceMetrics(object):
    """
    Tracks and calculates metrics to evaluate how closely a model's
    predictions match the true values it was designed to predict.

    Parameters
    ----------
    get_feature_from_index_fn : types.FunctionType
        A function that takes an index (`int`) and returns a feature
        name (`str`).
    report_gt_feature_n_positives : int, optional
        Default is 10. The minimum number of positive examples for a
        feature in order to compute the score for it.
    metrics : dict, optional
        A dictionary that maps metric names (`str`) to metric functions.
        By default, this contains `"roc_auc"`, which maps to
        `sklearn.metrics.roc_auc_score`, and `"average_precision"`,
        which maps to `sklearn.metrics.average_precision_score`.

    Attributes
    ----------
    skip_threshold : int
        The minimum number of positive examples of a feature that must
        be included in an update for a metric score to be calculated
        for it.
    get_feature_from_index : types.FunctionType
        A function that takes an index (`int`) and returns a feature
        name (`str`).
    metrics : dict
        A dictionary that maps metric names (`str`) to metric objects
        (`Metric`). By default, this contains `"roc_auc"` and
        `"average_precision"`.

    """

    def __init__(self,
                 get_feature_from_index_fn,
                 report_gt_feature_n_positives=10,
                 metrics=dict(roc_auc=roc_auc_score,
                              average_precision=average_precision_score)):
        """
        Creates a new object of the `PerformanceMetrics` class.
        """
        self.skip_threshold = report_gt_feature_n_positives
        self.get_feature_from_index = get_feature_from_index_fn
        self.metrics = dict()
        for k, v in metrics.items():
            self.metrics[k] = Metric(fn=v, data=[])

    def add_metric(self, name, metric_fn):
        """
        Begins tracking of the specified metric.

        Parameters
        ----------
        name : str
            The name of the metric.
        metric_fn : types.FunctionType
            A metric function.

        """
        self.metrics[name] = Metric(fn=metric_fn, data=[])

    def remove_metric(self, name):
        """
        Ends the tracking of the specified metric, and returns the
        previous scores associated with that metric.

        Parameters
        ----------
        name : str
            The name of the metric.

        Returns
        -------
        list(float)
            The list of feature-specific scores obtained by previous
            uses of the specified metric.

        """
        data = self.metrics[name].data
        del self.metrics[name]
        return data

    def update(self, prediction, target):
        """
        Evaluates the tracked metrics on a model prediction and its
        target value, and adds this to the metric histories.

        Parameters
        ----------
        prediction : numpy.ndarray
            Value predicted by user model.
        target : numpy.ndarray
            True value that the user model was trying to predict.

        Returns
        -------
        dict
            A dictionary mapping each metric name (`str`) to the average
            score of that metric across all features (`float`).

        """
        metric_scores = {}
        for name, metric in self.metrics.items():
            avg_score, feature_scores = compute_score(
                prediction, target, metric.fn,
                report_gt_feature_n_positives=self.skip_threshold)
            metric.data.append(feature_scores)
            metric_scores[name] = avg_score
        return metric_scores

    def visualize(self, prediction, target, output_dir, **kwargs):
        """
        Outputs ROC and PR curves. Does not support other metrics
        currently.

        Parameters
        ----------
        prediction : numpy.ndarray
            Value predicted by user model.
        target : numpy.ndarray
            True value that the user model was trying to predict.
        output_dir : str
            The path to the directory to output the figures. Directories
            that do not currently exist will be automatically created.
        **kwargs : dict
            Keyword arguments to pass to each visualization function. Each
            function accepts the following args:

                * style : str - Default is "seaborn-colorblind". Specify a \
                          style available in \
                          `matplotlib.pyplot.style.available` to use.
                * dpi : int - Default is 500. Specify dots per inch \
                          (resolution) of the figure.

        Returns
        -------
        None
            Outputs figures to `output_dir`.

        """
        os.makedirs(output_dir, exist_ok=True)
        if "roc_auc" in self.metrics:
            visualize_roc_curves(
                prediction, target, output_dir,
                report_gt_feature_n_positives=self.skip_threshold,
                **kwargs)
        if "average_precision" in self.metrics:
            visualize_precision_recall_curves(
                prediction, target, output_dir,
                report_gt_feature_n_positives=self.skip_threshold,
                **kwargs)

    def write_feature_scores_to_file(self, output_path):
        """
        Writes each metric's score for each feature to a specified
        file.

        Parameters
        ----------
        output_path : str
            The path to the output file where performance metrics will
            be written.

        Returns
        -------
        dict
            A dictionary mapping feature names (`str`) to
            sub-dictionaries (`dict`). Each sub-dictionary then maps
            metric names (`str`) to the score for that metric on the
            given feature. If a metric was not evaluated on a given
            feature, the score will be `None`.

        """
        feature_scores = defaultdict(dict)
        for name, metric in self.metrics.items():
            feature_score_dict = get_feature_specific_scores(
                metric.data[-1], self.get_feature_from_index)
            for feature, score in feature_score_dict.items():
                # `score` is `None` when the metric was not evaluated on
                # this feature; record it so every feature maps to a
                # sub-dictionary keyed by metric name.
                feature_scores[feature][name] = score

        metric_cols = [m for m in self.metrics.keys()]
        cols = '\t'.join(["class"] + metric_cols)
        with open(output_path, 'w+') as file_handle:
            file_handle.write("{0}\n".format(cols))
            for feature, metric_scores in feature_scores.items():
                if not any(s is not None for s in metric_scores.values()):
                    file_handle.write(
                        "{0}\t{1}\n".format(
                            feature, "\t".join(["NA"] * len(metric_cols))))
                else:
                    metric_score_cols = '\t'.join(
                        ["NA" if metric_scores[m] is None
                         else "{0:.4f}".format(metric_scores[m])
                         for m in metric_cols])
                    file_handle.write("{0}\t{1}\n".format(feature,
                                                          metric_score_cols))
        return feature_scores

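# Illustrative end-to-end sketch only (not part of the selene_sdk API). The
# feature names, array shapes, and output path below are assumptions made for
# the example.
def _example_performance_metrics(scores_path="./example_scores.txt"):
    feature_names = ["feature_a", "feature_b", "feature_c"]
    metrics = PerformanceMetrics(
        lambda index: feature_names[index],
        report_gt_feature_n_positives=10)
    # Track an additional metric alongside the default roc_auc and
    # average_precision metrics.
    metrics.add_metric("auc_u_test", auc_u_test)

    rng = np.random.RandomState(0)
    target = rng.binomial(1, 0.3, size=(500, 3))
    prediction = np.clip(target * 0.5 + rng.rand(500, 3) * 0.5, 0., 1.)

    # Average score per metric across all features, e.g.
    # {"roc_auc": ..., "average_precision": ..., "auc_u_test": ...}
    average_scores = metrics.update(prediction, target)
    print(average_scores)

    # Tab-delimited per-feature scores for the most recent update.
    metrics.write_feature_scores_to_file(scores_path)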