# Source code for pythresh.thresholds.chau

import numpy as np
import scipy.stats as stats
from scipy.special import erfc

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize


class CHAU(BaseThresholder):
    r"""CHAU class for Chauvenet's criterion thresholder.

    Use the Chauvenet's criterion to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value below the Chauvenet's criterion.
    See :cite:`bolshev2016chau` for details

    Parameters
    ----------

    method : {'mean', 'median', 'gmean'}, optional (default='mean')
        Calculate the area normal to distance using a scaler

        - 'mean':   Construct a scaler with the mean of the scores
        - 'median': Construct a scaler with the median of the scores
        - 'gmean':  Construct a scaler with the geometric mean of the scores

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None.

    Attributes
    ----------

    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Raises
    ------

    KeyError
        If ``method`` is not one of 'mean', 'median' or 'gmean'.

    Notes
    -----

    The Chauvenet's criterion for a one tail of a distribution is defined
    as follows:

    .. math::

        D_{\mathrm{max}}>Z \mathrm{,}

    where :math:`D_{\mathrm{max}}` is the bounds of the probability band
    around the mean given by,

    .. math::

        D_{\mathrm{max}} = \lvert norm.ppf(Pz) \rvert \mathrm{,}

    where this bounds is equal to the inverse of a cumulative distribution
    function for a probability of one of the tails of the normal
    distribution, and :math:`P_z` is therefore defined as,

    .. math::

        P_z = \frac{1}{4n} \mathrm{,}

    with :math:`n` being the number of samples in the decision scores.
    Finally the z-score can be calculated as follows:

    .. math::

        Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}

    with :math:`\bar{x}` as the mean and :math:`\sigma` the standard
    deviation of the decision scores.

    CHAU employs variants of the classical Chauvenet's criterion as the
    mean can be replaced with the geometric mean or the median.

    Any z-score greater than the Chauvenet's criterion is considered an
    outlier.

    """

    def __init__(self, method='mean', random_state=1234):

        super().__init__()

        # Map the user-facing option name onto the function used to compute
        # the central value of the scores. An unknown name raises KeyError.
        stat = {'mean': np.mean, 'median': np.median, 'gmean': stats.gmean}

        # NOTE: after construction ``self.method`` holds the callable,
        # not the option string that was passed in.
        self.method = stat[method]
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """

        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        # Keep the preprocessed scores available on the instance.
        self.dscores_ = decision

        # Calculate Chauvenet's criterion for one tail: the tail
        # probability is 1/(4n) and the criterion is the reciprocal of the
        # magnitude of the standard normal quantile at that probability.
        Pz = 1/(4*len(decision))
        criterion = 1/abs(stats.norm.ppf(Pz))

        # Get area normal to distance: erfc(z / sqrt(2)) with z the absolute
        # deviation from the chosen central value in units of the standard
        # deviation, i.e. the two-sided normal tail probability of z.
        prob = erfc(np.abs(decision-self.method(decision))
                    / decision.std()/2.0**0.5)

        self.thresh_ = criterion * (1-np.min(prob))/np.max(prob)

        # ``cut`` is defined in thresh_utility (not visible here); its 0/1
        # result is inverted so that, presumably, probabilities falling
        # below the criterion are labelled 1 (outlier) -- verify against cut.
        return 1-cut(prob, criterion)