Source code for sklearn.metrics.ranking

"""Metrics to assess performance on classification task given scores

Functions named as ``*_score`` return a scalar value to maximize: the higher
the better

Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
the lower the better
"""

# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Arnaud Joly <a.joly@ulg.ac.be>
#          Jochen Wersdorfer <jochen@wersdoerfer.de>
#          Lars Buitinck <L.J.Buitinck@uva.nl>
#          Joel Nothman <joel.nothman@gmail.com>
#          Noel Dawe <noel@dawe.me>
# License: BSD 3 clause

from __future__ import division

import warnings
import numpy as np
from scipy.sparse import csr_matrix

from ..utils import check_consistent_length
from ..utils import column_or_1d, check_array
from ..utils.multiclass import type_of_target
from ..utils.fixes import isclose
from ..utils.stats import rankdata

from .base import _average_binary_score
from .base import UndefinedMetricWarning


[docs]def auc(x, y, reorder=False): """Compute Area Under the Curve (AUC) using the trapezoidal rule This is a general function, given points on a curve. For computing the area under the ROC-curve, see :func:`roc_auc_score`. Parameters ---------- x : array, shape = [n] x coordinates. y : array, shape = [n] y coordinates. reorder : boolean, optional (default=False) If True, assume that the curve is ascending in the case of ties, as for an ROC curve. If the curve is non-ascending, the result will be wrong. Returns ------- auc : float Examples -------- >>> import numpy as np >>> from sklearn import metrics >>> y = np.array([1, 1, 2, 2]) >>> pred = np.array([0.1, 0.4, 0.35, 0.8]) >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) >>> metrics.auc(fpr, tpr) 0.75 See also -------- roc_auc_score : Computes the area under the ROC curve precision_recall_curve : Compute precision-recall pairs for different probability thresholds """ check_consistent_length(x, y) x = column_or_1d(x) y = column_or_1d(y) if x.shape[0] < 2: raise ValueError('At least 2 points are needed to compute' ' area under curve, but x.shape = %s' % x.shape) direction = 1 if reorder: # reorder the data points according to the x axis and using y to # break ties order = np.lexsort((y, x)) x, y = x[order], y[order] else: dx = np.diff(x) if np.any(dx < 0): if np.all(dx <= 0): direction = -1 else: raise ValueError("Reordering is not turned on, and " "the x array is not increasing: %s" % x) area = direction * np.trapz(y, x) return area
def average_precision_score(y_true, y_score, average="macro", sample_weight=None): """Compute average precision (AP) from prediction scores This score corresponds to the area under the precision-recall curve. Note: this implementation is restricted to the binary classification task or multilabel classification task. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] True binary labels in binary label indicators. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'micro'``: Calculate metrics globally by considering each element of the label indicator matrix as a label. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). ``'samples'``: Calculate metrics for each instance, and find their average. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- average_precision : float References ---------- .. [1] `Wikipedia entry for the Average precision <http://en.wikipedia.org/wiki/Average_precision>`_ See also -------- roc_auc_score : Area under the ROC curve precision_recall_curve : Compute precision-recall pairs for different probability thresholds Examples -------- >>> import numpy as np >>> from sklearn.metrics import average_precision_score >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> average_precision_score(y_true, y_scores) # doctest: +ELLIPSIS 0.79... """ def _binary_average_precision(y_true, y_score, sample_weight=None): precision, recall, thresholds = precision_recall_curve( y_true, y_score, sample_weight=sample_weight) return auc(recall, precision) return _average_binary_score(_binary_average_precision, y_true, y_score, average, sample_weight=sample_weight) def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): """Compute Area Under the Curve (AUC) from prediction scores Note: this implementation is restricted to the binary classification task or multilabel classification task in label indicator format. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] True binary labels in binary label indicators. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'micro'``: Calculate metrics globally by considering each element of the label indicator matrix as a label. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). ``'samples'``: Calculate metrics for each instance, and find their average. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- auc : float References ---------- .. [1] `Wikipedia entry for the Receiver operating characteristic <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ See also -------- average_precision_score : Area under the precision-recall curve roc_curve : Compute Receiver operating characteristic (ROC) Examples -------- >>> import numpy as np >>> from sklearn.metrics import roc_auc_score >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> roc_auc_score(y_true, y_scores) 0.75 """ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if len(np.unique(y_true)) != 2: raise ValueError("Only one class present in y_true. ROC AUC score " "is not defined in that case.") fpr, tpr, tresholds = roc_curve(y_true, y_score, sample_weight=sample_weight) return auc(fpr, tpr, reorder=True) return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): """Calculate true and false positives per binary classification threshold. Parameters ---------- y_true : array, shape = [n_samples] True targets of binary classification y_score : array, shape = [n_samples] Estimated probabilities or decision function pos_label : int, optional (default=None) The label of the positive class sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- fps : array, shape = [n_thresholds] A count of false positives, at index i being the number of negative samples assigned a score >= thresholds[i]. The total number of negative samples is equal to fps[-1] (thus true negatives are given by fps[-1] - fps). tps : array, shape = [n_thresholds := len(np.unique(y_score))] An increasing count of true positives, at index i being the number of positive samples assigned a score >= thresholds[i]. The total number of positive samples is equal to tps[-1] (thus false negatives are given by tps[-1] - tps). thresholds : array, shape = [n_thresholds] Decreasing score values. """ check_consistent_length(y_true, y_score) y_true = column_or_1d(y_true) y_score = column_or_1d(y_score) if sample_weight is not None: sample_weight = column_or_1d(sample_weight) # ensure binary classification if pos_label is not specified classes = np.unique(y_true) if (pos_label is None and not (np.all(classes == [0, 1]) or np.all(classes == [-1, 1]) or np.all(classes == [0]) or np.all(classes == [-1]) or np.all(classes == [1]))): raise ValueError("Data is not binary and pos_label is not specified") elif pos_label is None: pos_label = 1. # make y_true a boolean vector y_true = (y_true == pos_label) # sort scores and corresponding truth values desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] y_score = y_score[desc_score_indices] y_true = y_true[desc_score_indices] if sample_weight is not None: weight = sample_weight[desc_score_indices] else: weight = 1. # y_score typically has many tied values. Here we extract # the indices associated with the distinct values. We also # concatenate a value for the end of the curve. # We need to use isclose to avoid spurious repeated thresholds # stemming from floating point roundoff errors. distinct_value_indices = np.where(np.logical_not(isclose( np.diff(y_score), 0)))[0] threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] # accumulate the true positives with decreasing threshold tps = (y_true * weight).cumsum()[threshold_idxs] if sample_weight is not None: fps = weight.cumsum()[threshold_idxs] - tps else: fps = 1 + threshold_idxs - tps return fps, tps, y_score[threshold_idxs] def precision_recall_curve(y_true, probas_pred, pos_label=None, sample_weight=None): """Compute precision-recall pairs for different probability thresholds Note: this implementation is restricted to the binary classification task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The last precision and recall values are 1. and 0. respectively and do not have a corresponding threshold. This ensures that the graph starts on the x axis. Parameters ---------- y_true : array, shape = [n_samples] True targets of binary classification in range {-1, 1} or {0, 1}. probas_pred : array, shape = [n_samples] Estimated probabilities or decision function. pos_label : int, optional (default=None) The label of the positive class sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- precision : array, shape = [n_thresholds + 1] Precision values such that element i is the precision of predictions with score >= thresholds[i] and the last element is 1. recall : array, shape = [n_thresholds + 1] Decreasing recall values such that element i is the recall of predictions with score >= thresholds[i] and the last element is 0. thresholds : array, shape = [n_thresholds := len(np.unique(probas_pred))] Increasing thresholds on the decision function used to compute precision and recall. Examples -------- >>> import numpy as np >>> from sklearn.metrics import precision_recall_curve >>> y_true = np.array([0, 0, 1, 1]) >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> precision, recall, thresholds = precision_recall_curve( ... y_true, y_scores) >>> precision # doctest: +ELLIPSIS array([ 0.66..., 0.5 , 1. , 1. ]) >>> recall array([ 1. , 0.5, 0.5, 0. ]) >>> thresholds array([ 0.35, 0.4 , 0.8 ]) """ fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, pos_label=pos_label, sample_weight=sample_weight) precision = tps / (tps + fps) recall = tps / tps[-1] # stop when full recall attained # and reverse the outputs so recall is decreasing last_ind = tps.searchsorted(tps[-1]) sl = slice(last_ind, None, -1) return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] def roc_curve(y_true, y_score, pos_label=None, sample_weight=None): """Compute Receiver operating characteristic (ROC) Note: this implementation is restricted to the binary classification task. Parameters ---------- y_true : array, shape = [n_samples] True binary labels in range {0, 1} or {-1, 1}. If labels are not binary, pos_label should be explicitly given. y_score : array, shape = [n_samples] Target scores, can either be probability estimates of the positive class or confidence values. pos_label : int Label considered as positive and others are considered negative. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- fpr : array, shape = [>2] Increasing false positive rates such that element i is the false positive rate of predictions with score >= thresholds[i]. tpr : array, shape = [>2] Increasing true positive rates such that element i is the true positive rate of predictions with score >= thresholds[i]. thresholds : array, shape = [n_thresholds] Decreasing thresholds on the decision function used to compute fpr and tpr. `thresholds[0]` represents no instances being predicted and is arbitrarily set to `max(y_score) + 1`. See also -------- roc_auc_score : Compute Area Under the Curve (AUC) from prediction scores Notes ----- Since the thresholds are sorted from low to high values, they are reversed upon returning them to ensure they correspond to both ``fpr`` and ``tpr``, which are sorted in reversed order during their calculation. References ---------- .. [1] `Wikipedia entry for the Receiver operating characteristic <http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ Examples -------- >>> import numpy as np >>> from sklearn import metrics >>> y = np.array([1, 1, 2, 2]) >>> scores = np.array([0.1, 0.4, 0.35, 0.8]) >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) >>> fpr array([ 0. , 0.5, 0.5, 1. ]) >>> tpr array([ 0.5, 0.5, 1. , 1. ]) >>> thresholds array([ 0.8 , 0.4 , 0.35, 0.1 ]) """ fps, tps, thresholds = _binary_clf_curve( y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) if tps.size == 0 or fps[0] != 0: # Add an extra threshold position if necessary tps = np.r_[0, tps] fps = np.r_[0, fps] thresholds = np.r_[thresholds[0] + 1, thresholds] if fps[-1] <= 0: warnings.warn("No negative samples in y_true, " "false positive value should be meaningless", UndefinedMetricWarning) fpr = np.repeat(np.nan, fps.shape) else: fpr = fps / fps[-1] if tps[-1] <= 0: warnings.warn("No positive samples in y_true, " "true positive value should be meaningless", UndefinedMetricWarning) tpr = np.repeat(np.nan, tps.shape) else: tpr = tps / tps[-1] return fpr, tpr, thresholds def label_ranking_average_precision_score(y_true, y_score): """Compute ranking-based average precision Label ranking average precision (LRAP) is the average over each ground truth label assigned to each sample, of the ratio of true vs. total labels with lower score. This metric is used in multilabel ranking problem, where the goal is to give better rank to the labels associated to each sample. The obtained score is always strictly greater than 0 and the best value is 1. Parameters ---------- y_true : array or sparse matrix, shape = [n_samples, n_labels] True binary labels in binary indicator format. y_score : array, shape = [n_samples, n_labels] Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. Returns ------- score : float Examples -------- >>> import numpy as np >>> from sklearn.metrics import label_ranking_average_precision_score >>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) >>> label_ranking_average_precision_score(y_true, y_score) \ # doctest: +ELLIPSIS 0.416... """ check_consistent_length(y_true, y_score) y_true = check_array(y_true, ensure_2d=False) y_score = check_array(y_score, ensure_2d=False) if y_true.shape != y_score.shape: raise ValueError("y_true and y_score have different shape") # Handle badly formated array and the degenerate case with one label y_type = type_of_target(y_true) if (y_type != "multilabel-indicator" and not (y_type == "binary" and y_true.ndim == 2)): raise ValueError("{0} format is not supported".format(y_type)) y_true = csr_matrix(y_true) y_score = -y_score n_samples, n_labels = y_true.shape out = 0. for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): relevant = y_true.indices[start:stop] if (relevant.size == 0 or relevant.size == n_labels): # If all labels are relevant or unrelevant, the score is also # equal to 1. The label ranking has no meaning. out += 1. continue scores_i = y_score[i] rank = rankdata(scores_i, 'max')[relevant] L = rankdata(scores_i[relevant], 'max') out += (L / rank).mean() return out / n_samples def coverage_error(y_true, y_score, sample_weight=None): """Coverage error measure Compute how far we need to go through the ranked scores to cover all true labels. The best value is equal to the average number of labels in ``y_true`` per sample. Ties in ``y_scores`` are broken by giving maximal rank that would have been assigned to all tied values. Parameters ---------- y_true : array, shape = [n_samples, n_labels] True binary labels in binary indicator format. y_score : array, shape = [n_samples, n_labels] Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- coverage_error : float References ---------- .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). Mining multi-label data. In Data mining and knowledge discovery handbook (pp. 667-685). Springer US. """ y_true = check_array(y_true, ensure_2d=False) y_score = check_array(y_score, ensure_2d=False) check_consistent_length(y_true, y_score, sample_weight) y_type = type_of_target(y_true) if y_type != "multilabel-indicator": raise ValueError("{0} format is not supported".format(y_type)) if y_true.shape != y_score.shape: raise ValueError("y_true and y_score have different shape") y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true)) y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1)) coverage = (y_score >= y_min_relevant).sum(axis=1) coverage = coverage.filled(0) return np.average(coverage, weights=sample_weight)