Source code for pythresh.thresholds.aucp

import numpy as np
from sklearn.metrics import auc

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, gen_kde, normalize


class AUCP(BaseThresholder):
    r"""AUCP class for Area Under Curve Percentage thresholder.

    Use the area under the curve to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value beyond where the auc of the kde is less
    than the (mean + abs(mean-median)) percent of the total kde auc.
    See :cite:`ren2018aucp` for details

    Parameters
    ----------

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None.

    Attributes
    ----------

    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----

    The area under the curve (AUC) is defined as follows:

    .. math::

        AUC = \mathrm{lim}_{x\rightarrow\inf} \sum_{i=1}^{n} f(x) \delta x \mathrm{,}

    where :math:`f(x)` is the curve and :math:`\delta x` is the incremental
    step size of the rectangles whose areas will be summed up. The AUCP method
    generates a curve using the pdf of the normalized decision scores over a
    range of 0-1. This is done with a kernel density estimation. The
    incremental size step is :math:`1/2n`, with :math:`n` being the number of
    points of the decision scores.

    The data range is scanned in steps from left to right starting from 0,
    and at each step the AUC of the part of the curve lying to the right of
    the current position is evaluated. The stopping limit is set to
    :math:`\mathrm{lim} = \bar{x} + \lvert \bar{x}-\tilde{x} \rvert`, where
    :math:`\bar{x}` is the mean decision score, and :math:`\tilde{x}` is the
    median decision score.

    The first position whose right-hand AUC is less than the total AUC of the
    pdf multiplied by the :math:`\mathrm{lim}` is set as the threshold between
    inliers and outliers. If no position satisfies this, the threshold
    defaults to 1.

    """

    def __init__(self, random_state=1234):

        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """

        decision = check_scores(decision, random_state=self.random_state)

        decision = normalize(decision)

        self.dscores_ = decision

        # Generate KDE on a grid with twice as many points as there are scores
        val, dat_range = gen_kde(decision, 0, 1, len(decision)*2)
        val = normalize(val)

        # NOTE(review): the grid is assumed to be ascending (gen_kde is called
        # with bounds 0 and 1) -- confirm against gen_kde.
        val = np.asarray(val, dtype=float)
        dat_range = np.asarray(dat_range, dtype=float)

        # Area of each trapezoid between neighbouring grid points
        seg_area = np.diff(dat_range) * (val[1:] + val[:-1]) / 2.0

        # tail_area[i] is the area under the curve from dat_range[i] to the
        # end of the grid. A single reversed cumulative sum replaces one
        # integration per grid point, and never integrates over a lone point.
        tail_area = np.cumsum(seg_area[::-1])[::-1]

        # Get the total area under the curve
        tot_area = tail_area[0] if tail_area.size else 0.0

        # Get area percentage limit
        mean = np.mean(decision)
        perc = mean+abs(mean-np.median(decision))

        # Apply the limit to where the area is less than that limit percentage
        # of the total area under the curve
        below = np.flatnonzero(tail_area < perc*tot_area)

        # Fall back to the upper end of the range when no position qualifies
        limit = dat_range[below[0]] if below.size else 1

        self.thresh_ = limit

        return cut(decision, limit)