import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from .base import BaseThresholder
from .thresh_utility import cut
[docs]
class CHAU(BaseThresholder):
    r"""CHAU class for Chauvenet's criterion thresholder.

    Use Chauvenet's criterion to derive a threshold for the scores
    generated by the decision_scores of an outlier detector. The
    criterion is converted into a single critical value on the scale of
    the decision scores, and the scores are labelled against that value.
    See :cite:`bolshev2016chau` for details.

    Parameters
    ----------
    method : {'classic', 'effective'}, optional (default='effective')
        Determines how the threshold is computed:

        - 'classic': Uses the classical Chauvenet's criterion based on
          all samples.
        - 'effective': Uses an entropy-based effective sample size to
          adjust the threshold.

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The classical Chauvenet's criterion identifies outliers in a dataset
    by computing a threshold based on the z-score of each observation:

    .. math::

        Z = \frac{x - \bar{x}}{\sigma} \mathrm{,}

    where :math:`\bar{x}` is the mean and :math:`\sigma` the standard
    deviation of the dataset. An observation is considered an outlier if
    the probability of obtaining a value at least as extreme is less than

    .. math::

        P_z = \frac{1}{2N} \mathrm{,}

    with :math:`N` being the total number of samples. The corresponding
    z-score threshold is then given by the inverse survival function of
    the standard normal distribution:

    .. math::

        Z_\mathrm{crit} = \mathrm{norm.isf}(P_z)

    Any observation with :math:`|Z| > Z_\mathrm{crit}` is flagged as an
    outlier. In this implementation only the upper critical value is
    used: :math:`Z_\mathrm{crit}` is mapped back to the scale of the
    decision scores and applied as a single cut-off.

    In the 'effective' method, the classical threshold is adjusted by an
    entropy-based effective sample size. This accounts for situations
    where the dataset may contain correlated or redundant samples,
    reducing the effective number of independent observations. The
    effective sample size :math:`N_\mathrm{eff}` is estimated as

    .. math::

        N_\mathrm{eff} = \min(N, \exp(H)) \mathrm{,}

    where :math:`H` is the entropy of the histogram of standardized
    scores. The threshold probability is then

    .. math::

        P_z = \frac{1}{2 N_\mathrm{eff}}

    Because :math:`N_\mathrm{eff} \le N`, the resulting critical value is
    never larger than the classical one, so the threshold adapts to the
    actual variability and redundancy in the data.
    """

    def __init__(self, method="effective", random_state=1234):
        super().__init__()
        self.method = method
        self.random_state = random_state
        # Seeds NumPy's *global* random number generator as a side effect
        # of constructing the thresholder.
        np.random.seed(random_state)

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        Raises
        ------
        ValueError
            If ``method`` is not one of 'classic' or 'effective'.
        """
        # Fail fast with a clear message; otherwise an unrecognised method
        # would leave the sample size undefined and surface later as an
        # opaque UnboundLocalError.
        if self.method not in ("classic", "effective"):
            raise ValueError(
                "method must be one of ('classic', 'effective'), "
                f"got {self.method!r}"
            )

        # NOTE(review): _data_setup is defined on the base class (not
        # visible here); presumably it validates the input and reduces it
        # to a 1D score array -- confirm against BaseThresholder.
        decision = self._data_setup(decision)

        # Standardise the scores to z-scores; the fitted scaler is kept so
        # the critical z-value can be mapped back to the score scale.
        scaler = StandardScaler()
        z = scaler.fit_transform(decision.reshape(-1, 1))

        if self.method == "classic":
            N = len(z)
        else:
            N = self._effective_sample_size_entropy(z)

        # Tail probability of the criterion and the matching critical
        # value of the standard normal distribution.
        Pz = 1 / (2 * N)
        zcrit = stats.norm.isf(Pz)

        # Express the critical value in the units of the original scores.
        zcrit = scaler.inverse_transform(np.array([[zcrit]]))[0, 0]

        self.thresh_ = zcrit

        return cut(decision, zcrit)

    def _effective_sample_size_entropy(self, x):
        """Entropy-based effective sample size.

        Parameters
        ----------
        x : np.ndarray
            Array of outlier likelihood scores. It is flattened before
            use, so a column vector of shape (n_samples, 1) is accepted.

        Returns
        -------
        Neff : float
            Entropy-based effective sample size, capped at the number of
            scores.
        """
        # Flatten so the cap below counts the same elements the histogram
        # does (np.histogram always works on the flattened array).
        x = np.asarray(x).ravel()
        N = x.size

        hist, _ = np.histogram(x, bins="auto", density=False)
        hist = hist.astype(float)

        # Empirical bin probabilities; empty bins are dropped to avoid
        # evaluating log(0).
        p = hist / hist.sum()
        p = p[p > 0]

        # Shannon entropy (natural log) of the binned distribution;
        # exp(H) is its equivalent number of equally populated bins.
        H = -np.sum(p * np.log(p))
        Neff = np.exp(H)

        return min(N, Neff)