import numpy as np
import scipy.stats as stats
from scipy.special import erfc
from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize
class CHAU(BaseThresholder):
    r"""CHAU class for Chauvenet's criterion thresholder.

    Use the Chauvenet's criterion to evaluate a non-parametric
    means to threshold scores generated by the decision_scores
    where outliers are set to any value below the Chauvenet's
    criterion. See :cite:`bolshev2016chau` for details.

    Parameters
    ----------
    method : {'mean', 'median', 'gmean'}, optional (default='mean')
        Calculate the area normal to distance using a scaler

        - 'mean': Construct a scaler with the mean of the scores
        - 'median': Construct a scaler with the median of the scores
        - 'gmean': Construct a scaler with the geometric mean of the scores

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The Chauvenet's criterion for one tail of a distribution is defined
    as follows:

    .. math::

       D_{\mathrm{max}} > Z \mathrm{,}

    where :math:`D_{\mathrm{max}}` is the bounds of the probability band
    around the mean given by,

    .. math::

       D_{\mathrm{max}} = \lvert norm.ppf(P_z) \rvert \mathrm{,}

    where this bound is equal to the inverse of a cumulative distribution
    function for a probability of one of the tails of the normal
    distribution, and :math:`P_z` is therefore defined as,

    .. math::

       P_z = \frac{1}{4n} \mathrm{,}

    with :math:`n` being the number of samples in the decision scores.
    Finally the z-score can be calculated as follows:

    .. math::

       Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}

    with :math:`\bar{x}` as the mean and :math:`\sigma` the standard
    deviation of the decision scores.

    CHAU employs variants of the classical Chauvenet's criterion as the
    mean can be replaced with the geometric mean or the median.

    Any z-score greater than the Chauvenet's criterion is considered an
    outlier.
    """

    def __init__(self, method='mean', random_state=1234):
        super().__init__()
        # Resolve the central-tendency estimator once here so eval() can
        # call it directly; reject unknown names with a clear ValueError
        # instead of letting a bare KeyError escape.
        stat = {'mean': np.mean, 'median': np.median, 'gmean': stats.gmean}
        try:
            self.method = stat[method]
        except KeyError:
            raise ValueError(
                "method must be one of 'mean', 'median' or 'gmean', "
                "got {!r}".format(method)) from None
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
            which are the decision scores from an
            outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)
        self.dscores_ = decision

        # One-tail probability for Chauvenet's criterion: P_z = 1/(4n).
        Pz = 1 / (4 * len(decision))

        # Probability-band bound; note this implementation uses the
        # reciprocal of |norm.ppf(P_z)|, i.e. 1/D_max, as the limit.
        criterion = 1 / abs(stats.norm.ppf(Pz))

        # Tail probability of each score's z-score about the chosen
        # central estimator: erfc(|x - c| / (sigma * sqrt(2))), where
        # sigma is the population std (numpy's ddof=0 default).
        prob = erfc(np.abs(decision - self.method(decision)) /
                    decision.std() / 2.0**0.5)

        self.thresh_ = criterion * (1 - np.min(prob)) / np.max(prob)

        # NOTE(review): assumes cut() labels values above the limit as 1
        # (high tail probability = inlier-like), so the labels are
        # inverted here to make 1 mean outlier — confirm against
        # thresh_utility.cut.
        return 1 - cut(prob, criterion)