import numpy as np
import scipy.stats as stats
from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize
# https://github.com/bhattbhavesh91/outlier-detection-grubbs-test-and-generalized-esd-test-python/blob/master/generalized-esd-test-for-outliers.ipynb
class GESD(BaseThresholder):
    r"""GESD class for Generalized Extreme Studentized Deviate thresholder.

    Use the generalized extreme studentized deviate to evaluate a
    non-parametric means to threshold scores generated by the decision_scores
    where outliers are set to any less than the smallest detected outlier.
    See :cite:`alrawashdeh2021gesd` for details.

    Parameters
    ----------
    max_outliers : int, optional (default='auto')
        maximum number of outliers that the dataset may have. Default sets
        max_outliers to be half the size of the dataset

    alpha : float, optional (default=0.05)
        significance level

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders. Can
        also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The generalized extreme studentized deviate is defined for the
    hypothesis:

    H0: There are no outliers in the decision scores
    Ha: There are up to r amount of outliers in the decision scores

    The test statistic is given as,

    .. math::

        R_i = \frac{\mathrm{MAX} \left(x_i - \bar{x}\right)}{\sigma} \mathrm{,}

    where :math:`\bar{x}` and :math:`\sigma` are the mean and standard
    deviation of the decision scores respectively.

    The GESD maximized :math:`R_i` value is removed from the decision scores
    and the r test statistic is recomputed and is tested against the r
    critical values:

    .. math::

        \lambda_i = \frac{(n-i)t_{p,n-i-1}}{\sqrt{\left(n-i-1+t_{p,n-i-1}^2\right)(n-i+1)}} \,\, i = 1,2,...,r \mathrm{,}

    where :math:`t_{p,v}` is the :math:`p_{100}` for a t-distribution with
    :math:`v` degrees of freedom such that for a selected significance level
    :math:`\alpha`:

    .. math::

        p = 1-\frac{\alpha}{2(n-i+1)} \mathrm{.}

    The threshold for the decision scores is set to the smallest score that
    fulfills the condition :math:`R_i>\lambda_i`.
    """

    def __init__(self, max_outliers='auto', alpha=0.05, random_state=1234):
        self.max_outliers = max_outliers
        self.alpha = alpha
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)
        self.dscores_ = decision

        arr = decision.copy()
        # Scores are normalized to [0, 1]; 1.1 is a sentinel meaning
        # "no outlier detected yet" so cut() labels everything inlier.
        limit = 1.1

        # Resolve 'auto' into a local value. Previously the code assigned
        # back to self.max_outliers, silently overwriting the user-supplied
        # hyperparameter as a side effect of eval().
        if self.max_outliers == 'auto':
            max_outliers = len(decision) // 2
        else:
            max_outliers = self.max_outliers

        for _ in range(max_outliers):
            crit = self._calc_crit(len(arr), self.alpha)
            stat, max_index = self._grubbs_stat(arr)
            if (stat > crit) and (arr[max_index] < limit):
                # Most-deviant score is a detected outlier: tighten the
                # threshold to it and remove it before re-testing.
                limit = arr[max_index]
                arr = np.delete(arr, max_index)
            else:
                # arr was not modified, so every remaining iteration would
                # recompute the identical statistic — stop early. (The
                # original loop spun through all remaining iterations with
                # no effect.)
                break

        self.thresh_ = limit
        return cut(decision, limit)

    def _grubbs_stat(self, y):
        """Return the Grubbs test statistic and the index of the most
        deviant score.

        Parameters
        ----------
        y : 1D np.array of decision scores.

        Returns
        -------
        (Gcal, max_ind) : the statistic max|y - mean| / std and the index
            attaining the maximum absolute deviation.
        """
        # NOTE(review): if y is constant, y.std() is 0 and this yields
        # nan/inf with a runtime warning — preserved original behavior.
        dev = np.abs(y - y.mean())
        max_ind = np.argmax(dev)
        Gcal = np.max(dev) / y.std()
        return Gcal, max_ind

    def _calc_crit(self, size, alpha):
        """Two-sided Grubbs/ESD critical value for a sample of ``size``
        points at significance level ``alpha``.

        Parameters
        ----------
        size : int, current number of remaining scores (must be > 2).
        alpha : float, significance level.

        Returns
        -------
        float critical value to compare the Grubbs statistic against.
        """
        # t critical value at the Bonferroni-adjusted tail probability.
        t_crit = stats.t.ppf(1 - alpha / (2 * size), size - 2)
        # np.sqrt(np.square(t)) is simply |t|; written as abs for clarity.
        numerator = (size - 1) * np.abs(t_crit)
        denominator = np.sqrt(size) * np.sqrt(size - 2 + np.square(t_crit))
        return numerator / denominator