import numpy as np
import scipy.stats as stats
from .base import BaseThresholder
from .thresh_utility import cut, normalize
[docs]
class REGR(BaseThresholder):
    """REGR class for Regression based thresholder.

    Use the regression to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value beyond the y-intercept value of the linear fit.
    See :cite:`aggarwal2017clf` for details.

    Parameters
    ----------
    method : {'siegel', 'theil'}, optional (default='siegel')
        Regression based method to calculate the y-intercept

        - 'siegel': implements a method for robust linear regression using repeated medians
        - 'theil': implements a method for robust linear regression using paired values

    fallback : str ('ignore', 'warn', 'raise'), optional (default='warn')
        The action to take for thresholders when their criteria are
        not met. In these cases when set to 'ignore' on eval and fit
        all train data is set to inliers and the threshold is set to
        max of the train scores + eps. Passing 'warn' will do the same as
        'ignore' but also produce a warning. If 'raise', the thresholder
        raises a ValueError.

    random_state : int, optional (default=1234)
        Random seed for the normal distribution. Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    Examples
    --------
    The effects of randomness can affect the thresholder's output performance
    significantly. Therefore, to alleviate the effects of randomness on the
    thresholder a combined model can be used with different random_state values.
    E.g.

    .. code:: python

        # train the KNN detector
        from pyod.models.knn import KNN
        from pythresh.thresholds.comb import COMB
        from pythresh.thresholds.regr import REGR

        clf = KNN()
        clf.fit(X_train)

        # get outlier scores
        decision_scores = clf.decision_scores_  # raw outlier scores

        # get outlier labels with combined model
        thres = COMB(thresholders = [REGR(random_state=1234),
        REGR(random_state=42), REGR(random_state=9685),
        REGR(random_state=111222)])
        labels = thres.eval(decision_scores)
    """

    def __init__(self, method="siegel", fallback="warn", random_state=1234):
        super().__init__(fallback=fallback)
        self.method = method
        self.random_state = random_state
        # NOTE(review): this seeds NumPy's *global* legacy RNG as a side
        # effect of construction. eval() draws from its own Generator built
        # from self.random_state, so this call looks redundant here; it is
        # kept only so existing callers observe the same global state.
        np.random.seed(random_state)

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        Raises
        ------
        ValueError
            If ``method`` is neither 'siegel' nor 'theil'.
        """
        decision = self._data_setup(decision)

        # Map each supported method name to its SciPy robust regressor.
        regressors = {
            "siegel": stats.siegelslopes,
            "theil": stats.theilslopes,
        }
        regress = regressors.get(self.method)
        if regress is None:
            # Without this check an unknown method left the regression result
            # unbound and surfaced later as an opaque UnboundLocalError.
            raise ValueError(
                f"method must be one of {sorted(regressors)}, "
                f"got {self.method!r}"
            )

        # Create a normal distribution and normalize
        rng = np.random.default_rng(self.random_state)
        norm = rng.normal(loc=0.0, scale=1.0, size=decision.shape)
        norm = normalize(norm)

        # Set limit to the y-intercept: the normalized normal sample is the
        # dependent variable and the decision scores the independent one.
        try:
            res = regress(norm, decision)
        except MemoryError:
            # The regression could not allocate its working memory; fall back
            # to an intercept of 1 + eps. Presumably this lies above every
            # prepared score so nothing is flagged -- confirm against
            # _data_setup and cut.
            eps = np.finfo(decision.dtype).eps
            res = [0.0, 1.0 + eps]

        # Index 1 of the result is the intercept of the fitted line.
        limit = res[1]
        self.thresh_ = limit

        return cut(decision, limit)