Source code for pythresh.thresholds.regr

import numpy as np
import scipy.stats as stats

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize


class REGR(BaseThresholder):
    """REGR class for Regression based thresholder.

    Use the regression to evaluate a non-parametric means to threshold
    scores generated by the decision_scores where outliers are set to
    any value beyond the y-intercept value of the linear fit.
    See :cite:`aggarwal2017clf` for details.

    Parameters
    ----------
    method : {'siegel', 'theil'}, optional (default='siegel')
        Regression based method to calculate the y-intercept

        - 'siegel': implements a method for robust linear regression
          using repeated medians
        - 'theil':  implements a method for robust linear regression
          using paired values

    random_state : int, optional (default=1234)
        random seed for the normal distribution. Can also be set to None

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : decision scores as returned by ``check_scores`` followed
        by ``normalize``; stored on every call to ``eval``

    Examples
    --------
    The effects of randomness can affect the thresholder's output
    performance significantly. Therefore, to alleviate the effects of
    randomness on the thresholder a combined model can be used with
    different random_state values. E.g.

    .. code:: python

        # train the KNN detector
        from pyod.models.knn import KNN
        from pythresh.thresholds.comb import COMB
        from pythresh.thresholds.regr import REGR

        clf = KNN()
        clf.fit(X_train)

        # get outlier scores
        decision_scores = clf.decision_scores_  # raw outlier scores

        # get outlier labels with combined model
        thres = COMB(thresholders=[REGR(random_state=1234),
                                   REGR(random_state=42),
                                   REGR(random_state=9685),
                                   REGR(random_state=111222)])
        labels = thres.eval(decision_scores)
    """

    def __init__(self, method='siegel', random_state=1234):

        super().__init__()
        # Parameters are stored as given; ``method`` is validated when
        # ``eval`` is called.
        self.method = method
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        Raises
        ------
        ValueError
            If ``method`` is neither 'siegel' nor 'theil'.
        """

        # Pick the estimator up front so that an unsupported ``method``
        # fails with a clear message before any attribute is modified,
        # rather than surfacing later as an unbound local variable.
        if self.method == 'siegel':
            fit_line = stats.siegelslopes
        elif self.method == 'theil':
            fit_line = stats.theilslopes
        else:
            raise ValueError(
                "method must be 'siegel' or 'theil', "
                f'got {self.method!r}'
            )

        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        self.dscores_ = decision

        # Create a normal distribution and normalize
        rng = np.random.default_rng(self.random_state)
        norm = normalize(rng.normal(loc=0.0, scale=1.0,
                                    size=decision.shape))

        # Set limit to the y-intercept. The random sample is passed as
        # the first (y) argument and the scores as the second (x); the
        # intercept is the second element of the returned result.
        try:
            limit = fit_line(norm, decision)[1]
        except MemoryError:
            # Deliberate best-effort fallback: if the fit runs out of
            # memory, use an intercept of 1.0 instead of failing.
            limit = 1.0

        self.thresh_ = limit

        return cut(decision, limit)