Source code for pythresh.thresholds.regr

import numpy as np
import scipy.stats as stats

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize



[docs]
class REGR(BaseThresholder):
    """REGR class for Regression based thresholder.

       Use the regression to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the y-intercept value of the linear fit.
       See :cite:`aggarwal2017clf` for details.

       Parameters
       ----------

       method : {'siegel', 'theil'}, optional (default='siegel')
            Regression based method to calculate the y-intercept

            - 'siegel': implements a method for robust linear regression using repeated medians
            - 'theil':  implements a method for robust linear regression using paired values

       random_state : int, optional (default=1234)
            random seed for the normal distribution. Can also be set to None

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers

       Examples
       --------
       The effects of randomness can affect the thresholder's output performance
       significantly. Therefore, to alleviate the effects of randomness on the
       thresholder a combined model can be used with different random_state values.
       E.g.

       .. code:: python

            # train the KNN detector
            from pyod.models.knn import KNN
            from pythresh.thresholds.comb import COMB
            from pythresh.thresholds.regr import REGR

            clf = KNN()
            clf.fit(X_train)

            # get outlier scores
            decision_scores = clf.decision_scores_  # raw outlier scores

            # get outlier labels with combined model
            thres = COMB(thresholders = [REGR(random_state=1234),
            REGR(random_state=42), REGR(random_state=9685),
            REGR(random_state=111222)])
            labels = thres.eval(decision_scores)
    """

    def __init__(self, method='siegel', random_state=1234):

        super().__init__()
        self.method = method
        self.random_state = random_state


[docs]
    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """

        decision = check_scores(decision, random_state=self.random_state)

        decision = normalize(decision)

        self.dscores_ = decision

        # Create a normal distribution and normalize
        norm = np.random.default_rng(self.random_state).normal(
            loc=0.0, scale=1.0, size=decision.shape)
        norm = normalize(norm)

        # Set limit to the y-intercept
        try:
            if self.method == 'siegel':
                res = stats.siegelslopes(norm, decision)
            elif self.method == 'theil':
                res = stats.theilslopes(norm, decision)
        except MemoryError:
            res = [0.0, 1.0]

        limit = res[1]

        self.thresh_ = limit

        return cut(decision, limit)