Source code for pythresh.thresholds.chau

import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler

from .base import BaseThresholder
from .thresh_utility import cut


class CHAU(BaseThresholder):
    r"""CHAU class for Chauvenet's criterion thresholder.

    Use Chauvenet's criterion as a non-parametric means to threshold the
    decision scores produced by an outlier detector. The criterion yields
    a critical z-score, which is mapped back to the scale of the decision
    scores and used as the cut-off handed to ``cut``.
    See :cite:`bolshev2016chau` for details.

    Parameters
    ----------
    method : {'classic', 'effective'}, optional (default='effective')
        Determines how the threshold is computed:

        - 'classic': Uses the classical Chauvenet's criterion based on
          all samples.
        - 'effective': Uses an entropy-based effective sample size to
          adjust the threshold.

        The value is validated when ``eval`` is called.

    random_state : int, optional (default=1234)
        Seed passed to ``np.random.seed`` when the object is constructed
        (this seeds NumPy's global random number generator). Can also be
        set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers,
        expressed on the scale of the scores returned by the data setup
        step. Set by ``eval``.

    dscores_ : 1D array of decomposed decision scores.
        NOTE(review): not assigned anywhere in this class; presumably set
        by the base-class data setup -- verify against ``BaseThresholder``.

    Notes
    -----
    The classical Chauvenet's criterion identifies outliers in a dataset
    by computing a threshold based on the z-score of each observation:

    .. math::

        Z = \frac{x - \bar{x}}{\sigma} \mathrm{,}

    where :math:`\bar{x}` is the mean and :math:`\sigma` the standard
    deviation of the dataset. An observation is considered an outlier if
    the probability of obtaining a value at least as extreme is less than

    .. math::

        P_z = \frac{1}{2N} \mathrm{,}

    with :math:`N` being the total number of samples. The corresponding
    z-score threshold is then given by the inverse survival function of
    the standard normal distribution:

    .. math::

        Z_\mathrm{crit} = \mathrm{norm.isf}(P_z)

    The textbook criterion flags any observation with
    :math:`|Z| > Z_\mathrm{crit}`. This implementation converts the
    positive :math:`Z_\mathrm{crit}` back to the score scale and applies
    it as a single threshold.

    In the 'effective' method, the classical threshold is adjusted by an
    entropy-based effective sample size. This accounts for situations
    where the dataset may contain correlated or redundant samples,
    reducing the effective number of independent observations. The
    effective sample size :math:`N_\mathrm{eff}` is estimated as

    .. math::

        N_\mathrm{eff} = \min(N, \exp(H)) \mathrm{,}

    where :math:`H` is the entropy of the histogram of standardized
    scores. The threshold probability is then

    .. math::

        P_z = \frac{1}{2 N_\mathrm{eff}}

    Because :math:`N_\mathrm{eff} \le N`, this :math:`P_z` is never
    smaller than the classical one, so the resulting
    :math:`Z_\mathrm{crit}` is never larger; the threshold thereby adapts
    to the variability and redundancy in the data.
    """

    def __init__(self, method="effective", random_state=1234):

        super().__init__()
        self.method = method
        self.random_state = random_state
        # Seeds NumPy's *global* RNG, as the original implementation does.
        np.random.seed(random_state)

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
            The labels are whatever ``cut`` returns for the computed
            threshold.

        Raises
        ------
        ValueError
            If ``self.method`` is not 'classic' or 'effective'.
        """

        # Fail fast with a clear message; previously an unknown method
        # fell through both branches and surfaced as an UnboundLocalError.
        if self.method not in ("classic", "effective"):
            raise ValueError(
                "method must be 'classic' or 'effective', "
                f"got {self.method!r}"
            )

        decision = self._data_setup(decision)

        # Standardize the scores so the criterion can be expressed as a
        # z-score; the fitted scaler is reused below to map it back.
        scaler = StandardScaler()
        z = scaler.fit_transform(decision.reshape(-1, 1))

        if self.method == "classic":
            n_samples = len(z)
        else:
            n_samples = self._effective_sample_size_entropy(z)

        # Chauvenet's tail probability and the matching critical z-score.
        p_z = 1 / (2 * n_samples)
        z_crit = stats.norm.isf(p_z)

        # Express the critical z-score on the original score scale.
        limit = scaler.inverse_transform(np.array([[z_crit]]))[0, 0]

        self.thresh_ = limit

        return cut(decision, limit)

    def _effective_sample_size_entropy(self, x):
        """Entropy-based effective sample size.

        Parameters
        ----------
        x : np.ndarray
            Array of outlier likelihood scores. ``np.histogram`` works on
            the flattened data, while the sample count used for the upper
            bound is ``len(x)`` (the length of the first axis).

        Returns
        -------
        Neff : float
            ``min(len(x), exp(H))`` where ``H`` is the Shannon entropy
            (natural log) of the normalized histogram of ``x``.
        """

        x = np.asarray(x)
        n_samples = len(x)

        counts, _ = np.histogram(x, bins="auto", density=False)
        counts = counts.astype(float)

        probs = counts / counts.sum()
        probs = probs[probs > 0]  # avoid log(0)

        entropy = -np.sum(probs * np.log(probs))
        n_eff = np.exp(entropy)

        # The effective size can never exceed the actual sample count.
        return min(n_samples, n_eff)