import numpy as np
import scipy.stats as stats
from .base import BaseThresholder
from .thresh_utility import cut
# See: https://github.com/vvaezian/modified_thompson_tau_test/blob/main/src/Modified_Thompson_Tau_Test/modified_thompson_tau_test.py
class MTT(BaseThresholder):
    r"""MTT class for Modified Thompson Tau test thresholder.

    Use the modified Thompson Tau test to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value beyond the smallest outlier detected by the test.
    See :cite:`rengasamy2020mtt` for details.

    Parameters
    ----------
    alpha : float, optional (default=0.01)
        Significance level used to look up the Student t critical value.
        A value above 0.5 is interpreted as a confidence level and is
        stored as ``1 - alpha``.

    fallback : str ('ignore', 'warn', 'raise'), optional (default='warn')
        The action to take for thresholders when their criterion are
        not met. In these cases when set to 'ignore' on eval and fit
        all train data is set to inliers and the threshold is set to
        max of the train scores + eps. Passing 'warn' will do the same as
        'ignore' but also produce a warning. If 'raise', the thresholder
        raises a ValueError.

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The Modified Thompson Tau test is a modified univariate t-test that
    eliminates outliers that are more than a number of standard deviations
    away from the mean. This method is done iteratively with the Tau
    critical value being recalculated after each outlier removal until the
    dataset no longer has data points that fall outside of the criterion.
    The Tau critical value can be obtained by,

    .. math::

        \tau = \frac{t \cdot (n-1)}{\sqrt{n}\sqrt{n-2+t^2}} \mathrm{,}

    where :math:`n` is the number of data points and :math:`t` is the
    student t-value.
    """

    def __init__(self, alpha=0.01, fallback="warn", random_state=1234):
        super().__init__(fallback=fallback)

        # Fold a confidence level (e.g. 0.99) onto the equivalent
        # significance level (0.01) so both conventions are accepted.
        self.alpha = alpha if alpha <= 0.5 else 1 - alpha
        self.random_state = random_state
        # NOTE: this seeds NumPy's *global* random number generator.
        np.random.seed(random_state)

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        The largest remaining score is repeatedly compared against the
        Tau critical value and removed while it exceeds it. The threshold
        is the last (i.e. smallest) score that was removed this way.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        decision = self._data_setup(decision)

        # np.sort returns a new array, so the input is left untouched.
        remaining = np.sort(decision)

        # Default threshold used when no score is rejected: just above 1.
        # NOTE(review): presumably _data_setup scales scores into [0, 1],
        # which would make this label everything an inlier -- confirm.
        eps = np.finfo(decision.dtype).eps
        limit = 1.0 + eps

        while True:
            n = len(remaining)

            # The t distribution needs df = n - 2 >= 1. Stop explicitly
            # rather than relying on NaN comparisons being False.
            if n < 3:
                break

            # Zero (or NaN) spread means no score can stand out; this also
            # avoids a division by zero below.
            spread = remaining.std()
            if not spread > 0:
                break

            # Calculate the rejection threshold
            t = stats.t.ppf(1 - self.alpha, df=n - 2)
            thres = (t * (n - 1)) / (np.sqrt(n) * np.sqrt(n - 2 + t**2))

            # Standardised distance of the current maximum from the mean
            delta = np.abs(remaining[-1] - remaining.mean()) / spread

            if delta > thres:
                limit = remaining[-1]
                # Dropping the tail with a slice is a view, not a copy,
                # so each removal is O(1) instead of O(n).
                remaining = remaining[:-1]
            else:
                break

        # Validation hook inherited from the base class; its handling of a
        # rejected threshold is presumably governed by ``fallback``.
        self._check_threshold(limit)
        self.thresh_ = limit

        return cut(decision, limit)