Source code for selene_sdk.utils.performance_metrics
"""
This module provides the `PerformanceMetrics` class and supporting
functionality for tracking and computing model performance.
"""
from collections import defaultdict, namedtuple
import logging
import os
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from scipy.stats import rankdata
logger = logging.getLogger("selene")
Metric = namedtuple("Metric", ["fn", "data"])
"""
A tuple containing a metric function and the results from applying that
metric to some values.
Parameters
----------
fn : types.FunctionType
A metric.
data : list(float)
A list holding the results from applying the metric.
Attributes
----------
fn : types.FunctionType
A metric.
data : list(float)
A list holding the results from applying the metric.
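Examples
--------
A minimal sketch of constructing a `Metric` entry; the choice of
`sklearn.metrics.roc_auc_score` here is illustrative:
>>> from sklearn.metrics import roc_auc_score
>>> roc_metric = Metric(fn=roc_auc_score, data=[])
>>> roc_metric.data
[]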
"""
def visualize_roc_curves(prediction,
target,
output_dir,
report_gt_feature_n_positives=50,
style="seaborn-colorblind",
fig_title="Feature ROC curves",
dpi=500):
"""
Output the ROC curves for each feature predicted by a model
as an SVG.
Parameters
----------
prediction : numpy.ndarray
Value predicted by user model.
target : numpy.ndarray
True value that the user model was trying to predict.
output_dir : str
The path to the directory to output the figures. Directories that
do not currently exist will be automatically created.
report_gt_feature_n_positives : int, optional
Default is 50. Do not visualize the ROC curve for any feature with
`report_gt_feature_n_positives` or fewer positive examples in `target`.
style : str, optional
Default is "seaborn-colorblind". Specify a style available in
`matplotlib.pyplot.style.available` to use.
fig_title : str, optional
Default is "Feature ROC curves". Set the figure title.
dpi : int, optional
Default is 500. Specify dots per inch (resolution) of the figure.
Returns
-------
None
Outputs the figure in `output_dir`.
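Examples
--------
A minimal usage sketch; the array shapes, random inputs, and output
directory below are purely illustrative:
>>> import numpy as np
>>> preds = np.random.rand(500, 3)
>>> targets = (np.random.rand(500, 3) > 0.5).astype(int)
>>> visualize_roc_curves(preds, targets, "./roc_figures",
...                      report_gt_feature_n_positives=10)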
"""
os.makedirs(output_dir, exist_ok=True)
import matplotlib
backend = matplotlib.get_backend()
if "inline" not in backend:
matplotlib.use("SVG")
import matplotlib.pyplot as plt
plt.style.use(style)
plt.figure()
for index, feature_preds in enumerate(prediction.T):
feature_targets = target[:, index]
if len(np.unique(feature_targets)) > 1 and \
np.sum(feature_targets) > report_gt_feature_n_positives:
fpr, tpr, _ = roc_curve(feature_targets, feature_preds)
plt.plot(fpr, tpr, color="black", alpha=0.3, lw=1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
if fig_title:
plt.title(fig_title)
plt.savefig(os.path.join(output_dir, "roc_curves.svg"),
format="svg",
dpi=dpi)
def visualize_precision_recall_curves(
prediction,
target,
output_dir,
report_gt_feature_n_positives=50,
style="seaborn-colorblind",
fig_title="Feature precision-recall curves",
dpi=500):
"""
Output the precision-recall (PR) curves for each feature predicted by
a model as an SVG.
Parameters
----------
prediction : numpy.ndarray
Value predicted by user model.
target : numpy.ndarray
True value that the user model was trying to predict.
output_dir : str
The path to the directory to output the figures. Directories that
do not currently exist will be automatically created.
report_gt_feature_n_positives : int, optional
Default is 50. Do not visualize the PR curve for any feature with
`report_gt_feature_n_positives` or fewer positive examples in `target`.
style : str, optional
Default is "seaborn-colorblind". Specify a style available in
`matplotlib.pyplot.style.available` to use.
fig_title : str, optional
Default is "Feature precision-recall curves". Set the figure title.
dpi : int, optional
Default is 500. Specify dots per inch (resolution) of the figure.
Returns
-------
None
Outputs the figure in `output_dir`.
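Examples
--------
A minimal usage sketch, analogous to `visualize_roc_curves`; the
random inputs and output directory are purely illustrative:
>>> import numpy as np
>>> preds = np.random.rand(500, 3)
>>> targets = (np.random.rand(500, 3) > 0.5).astype(int)
>>> visualize_precision_recall_curves(preds, targets, "./pr_figures")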
"""
os.makedirs(output_dir, exist_ok=True)
# TODO: fix this
import matplotlib
backend = matplotlib.get_backend()
if "inline" not in backend:
matplotlib.use("SVG")
import matplotlib.pyplot as plt
plt.style.use(style)
plt.figure()
for index, feature_preds in enumerate(prediction.T):
feature_targets = target[:, index]
if len(np.unique(feature_targets)) > 1 and \
np.sum(feature_targets) > report_gt_feature_n_positives:
precision, recall, _ = precision_recall_curve(
feature_targets, feature_preds)
plt.step(
recall, precision,
color="black", alpha=0.3, lw=1, where="post")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
if fig_title:
plt.title(fig_title)
plt.savefig(os.path.join(output_dir, "precision_recall_curves.svg"),
format="svg",
dpi=dpi)
def compute_score(prediction, target, metric_fn,
report_gt_feature_n_positives=10):
"""
Using a user-specified metric, scores how well `prediction` matches
`target` for each feature, and averages the scores across features.
Parameters
----------
prediction : numpy.ndarray
Value predicted by user model.
target : numpy.ndarray
True value that the user model was trying to predict.
metric_fn : types.FunctionType
A metric that can measure the distance between the prediction
and target variables.
report_gt_feature_n_positives : int, optional
Default is 10. The minimum number of positive examples for a
feature in order to compute the score for it.
Returns
-------
average_score, feature_scores : tuple(float, numpy.ndarray)
A tuple containing the average of all feature scores, and a
vector containing the scores for each feature. Features that do
not meet the filtering threshold have a score of `numpy.nan`.
If no feature meets the threshold, the average score is `None`.
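Examples
--------
A minimal sketch using `sklearn.metrics.roc_auc_score` as the metric;
the random inputs only illustrate the call signature:
>>> import numpy as np
>>> from sklearn.metrics import roc_auc_score
>>> preds = np.random.rand(200, 4)
>>> targets = (np.random.rand(200, 4) > 0.5).astype(int)
>>> avg_score, feature_scores = compute_score(
...     preds, targets, roc_auc_score)
>>> feature_scores.shape
(4,)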
"""
feature_scores = np.ones(target.shape[1]) * np.nan
# Handle the multi-class classification case, where each example has
# only one target value but multiple prediction values.
if target.shape[1] == 1 and prediction.shape[1] > 1:
prediction = [prediction]
else:
prediction = prediction.T
for index, feature_preds in enumerate(prediction):
feature_targets = target[:, index]
if len(np.unique(feature_targets)) > 1 and \
np.count_nonzero(feature_targets) > report_gt_feature_n_positives:
try:
feature_scores[index] = metric_fn(
feature_targets, feature_preds)
except ValueError:  # the metric could not be computed for this feature
continue
valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values.
if not valid_feature_scores:
return None, feature_scores
average_score = np.average(valid_feature_scores)
return average_score, feature_scores
def get_feature_specific_scores(data, get_feature_from_index_fn):
"""
Generates a dictionary mapping feature names to feature scores from
an intermediate representation.
Parameters
----------
data : list(float) or numpy.ndarray
A vector of scores, where the value at index `i` is the score
for the feature with index `i` (`numpy.nan` if no score was computed).
get_feature_from_index_fn : types.FunctionType
A function that takes an index (`int`) and returns a feature
name (`str`).
Returns
-------
dict
A dictionary mapping feature names (`str`) to scores (`float`).
If there was no score for a feature, its score will be set to
`None`.
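Examples
--------
A small, self-contained illustration; the index-to-name function and
scores are hypothetical:
>>> data = [0.91, float("nan")]
>>> get_feature_specific_scores(data, lambda i: "feature_{0}".format(i))
{'feature_0': 0.91, 'feature_1': None}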
"""
feature_score_dict = {}
for index, score in enumerate(data):
feature = get_feature_from_index_fn(index)
if not np.isnan(score):
feature_score_dict[feature] = score
else:
feature_score_dict[feature] = None
return feature_score_dict
def auc_u_test(labels, predictions):
"""
Computes the area under the ROC curve (AUC) for a set of labels and
the predictions given by the trained model, using the Mann-Whitney
U statistic.
Parameters
----------
labels : numpy.ndarray
Known labels of values predicted by model. Must be one-dimensional.
predictions : numpy.ndarray
Value predicted by user model. Must be one-dimensional and the same
length as `labels`.
Returns
-------
float
The AUC for the given labels and predictions.
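Examples
--------
A small worked example; `float(...)` is used only so the printed value
does not depend on the NumPy version:
>>> import numpy as np
>>> labels = np.array([0, 0, 1, 1])
>>> predictions = np.array([0.1, 0.4, 0.35, 0.8])
>>> float(auc_u_test(labels, predictions))
0.75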
"""
len_pos = int(np.sum(labels))
len_neg = len(labels) - len_pos
rank_sum = np.sum(rankdata(predictions)[labels == 1])
u_value = rank_sum - (len_pos * (len_pos + 1)) / 2
auc = u_value / (len_pos * len_neg)
return auc
class PerformanceMetrics(object):
"""
Tracks and calculates metrics to evaluate how closely a model's
predictions match the true values it was designed to predict.
Parameters
----------
get_feature_from_index_fn : types.FunctionType
A function that takes an index (`int`) and returns a feature
name (`str`).
report_gt_feature_n_positives : int, optional
Default is 10. The minimum number of positive examples for a
feature in order to compute the score for it.
metrics : dict, optional
A dictionary that maps metric names (`str`) to metric functions.
By default, this contains `"roc_auc"`, which maps to
`sklearn.metrics.roc_auc_score`, and `"average_precision"`,
which maps to `sklearn.metrics.average_precision_score`.
Attributes
----------
skip_threshold : int
The minimum number of positive examples of a feature that must
be included in an update for a metric score to be
calculated for it.
get_feature_from_index : types.FunctionType
A function that takes an index (`int`) and returns a feature
name (`str`).
metrics : dict
A dictionary that maps metric names (`str`) to metric objects
(`Metric`). By default, this contains `"roc_auc"` and
`"average_precision"`.
"""
def __init__(self,
get_feature_from_index_fn,
report_gt_feature_n_positives=10,
metrics=dict(roc_auc=roc_auc_score,
average_precision=average_precision_score)):
"""
Creates a new object of the `PerformanceMetrics` class.
"""
self.skip_threshold = report_gt_feature_n_positives
self.get_feature_from_index = get_feature_from_index_fn
self.metrics = dict()
for k, v in metrics.items():
self.metrics[k] = Metric(fn=v, data=[])
def add_metric(self, name, metric_fn):
"""
Begins tracking of the specified metric.
Parameters
----------
name : str
The name of the metric.
metric_fn : types.FunctionType
A metric function.
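Examples
--------
A short, self-contained sketch; the index-to-name function and the
choice of `sklearn.metrics.f1_score` are illustrative:
>>> from sklearn.metrics import f1_score
>>> metrics = PerformanceMetrics(lambda i: "feature_{0}".format(i))
>>> metrics.add_metric("f1", f1_score)
>>> sorted(metrics.metrics)
['average_precision', 'f1', 'roc_auc']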
"""
self.metrics[name] = Metric(fn=metric_fn, data=[])
def remove_metric(self, name):
"""
Ends the tracking of the specified metric, and returns the
previous scores associated with that metric.
Parameters
----------
name : str
The name of the metric.
Returns
-------
list(float)
The list of feature-specific scores obtained by previous
uses of the specified metric.
"""
data = self.metrics[name].data
del self.metrics[name]
return data
def update(self, prediction, target):
"""
Evaluates the tracked metrics on a model prediction and its
target value, and adds this to the metric histories.
Parameters
----------
prediction : numpy.ndarray
Value predicted by user model.
target : numpy.ndarray
True value that the user model was trying to predict.
Returns
-------
dict
A dictionary mapping each metric name (`str`) to the
average score of that metric across all features
(`float`).
"""
metric_scores = {}
for name, metric in self.metrics.items():
avg_score, feature_scores = compute_score(
prediction, target, metric.fn,
report_gt_feature_n_positives=self.skip_threshold)
metric.data.append(feature_scores)
metric_scores[name] = avg_score
return metric_scores
def visualize(self, prediction, target, output_dir, **kwargs):
"""
Outputs ROC and PR curves. Does not support other metrics
currently.
Parameters
----------
prediction : numpy.ndarray
Value predicted by user model.
target : numpy.ndarray
True value that the user model was trying to predict.
output_dir : str
The path to the directory to output the figures. Directories that
do not currently exist will be automatically created.
**kwargs : dict
Keyword arguments to pass to each visualization function. Each
function accepts the following args:
* style : str - Default is "seaborn-colorblind". Specify a \
style available in \
`matplotlib.pyplot.style.available` to use.
* dpi : int - Default is 500. Specify dots per inch \
(resolution) of the figure.
Returns
-------
None
Outputs figures to `output_dir`.
"""
os.makedirs(output_dir, exist_ok=True)
if "roc_auc" in self.metrics:
visualize_roc_curves(
prediction, target, output_dir,
report_gt_feature_n_positives=self.skip_threshold,
**kwargs)
if "average_precision" in self.metrics:
visualize_precision_recall_curves(
prediction, target, output_dir,
report_gt_feature_n_positives=self.skip_threshold,
**kwargs)
def write_feature_scores_to_file(self, output_path):
"""
Writes each metric's score for each feature to a specified
file.
Parameters
----------
output_path : str
The path to the output file where performance metrics will
be written.
Returns
-------
dict
A dictionary mapping feature names (`str`) to
sub-dictionaries (`dict`). Each sub-dictionary then maps
metric names (`str`) to the score for that metric on the
given feature. If a metric was not evaluated on a given
feature, the score will be `None`.
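Examples
--------
A self-contained sketch; the feature names, random inputs, and output
path are purely illustrative:
>>> import numpy as np
>>> metrics = PerformanceMetrics(lambda i: "feature_{0}".format(i))
>>> _ = metrics.update(np.random.rand(128, 2),
...                    (np.random.rand(128, 2) > 0.5).astype(int))
>>> scores = metrics.write_feature_scores_to_file("test_performance.txt")
The file is tab-delimited: a "class" column of feature names followed
by one column per tracked metric, with "NA" written for missing scores.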
"""
feature_scores = defaultdict(dict)
for name, metric in self.metrics.items():
feature_score_dict = get_feature_specific_scores(
metric.data[-1], self.get_feature_from_index)
for feature, score in feature_score_dict.items():
# Record `None` per metric so that one unscored metric does not
# overwrite or block scores recorded by other metrics.
feature_scores[feature][name] = score
metric_cols = [m for m in self.metrics.keys()]
cols = '\t'.join(["class"] + metric_cols)
with open(output_path, 'w+') as file_handle:
file_handle.write("{0}\n".format(cols))
for feature, metric_scores in feature_scores.items():
metric_score_cols = '\t'.join(
    ["NA" if metric_scores.get(m) is None
     else "{0:.4f}".format(metric_scores[m])
     for m in metric_cols])
file_handle.write("{0}\t{1}\n".format(feature,
                                      metric_score_cols))
return feature_scores