# Source code for pythresh.thresholds.clf

import numpy as np
from scipy.stats import gaussian_kde

from .base import BaseThresholder
from .thresh_utility import check_scores, normalize


class CLF(BaseThresholder):
    r"""CLF class for Trained Classifier thresholder.

    Use the trained linear classifier to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value beyond 0. See :cite:`aggarwal2017clf` for
    details.

    Parameters
    ----------
    method : {'simple', 'complex'}, optional (default='complex')
        Type of linear model

        - 'simple':  Uses only the scores
        - 'complex': Uses the scores, log of the scores, and the
          scores' PDF

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the
        thresholders. Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The classifier was trained using a linear stochastic gradient decent
    method. A warm start was assigned to the classifier was partially
    fit with the decision scores and true labels from multiple outlier
    detection methods available in `PyOD`. The :code:`generate_data`
    function from `PyOD` was used to create the outlier data, and the
    contaminations and random states were randomized each iterative
    step.
    """

    def __init__(self, method='complex', random_state=1234):
        # Pre-trained linear-model coefficients, fit offline with SGD
        # on PyOD-generated data (see Notes). They are fixed constants,
        # not re-fit at runtime.
        if method == 'complex':
            # Weights for the score, log-score, and score-PDF terms.
            self.m1 = 7.115947536708103
            self.m2 = -5.934885742167458
            self.m3 = -3.416078337348704
            self.c = 2.5731351150980992
        else:
            # Single weight for the raw scores only.
            self.m = 4.0581548062264075
            self.c = -1.5357998356223497

        self.method = method
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
            or np.array of shape (n_samples, n_detectors)
            which are the decision scores from a
            outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not it should be
            considered as an outlier according to the fitted model. 0
            stands for inliers and 1 for outliers.
        """
        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        self.dscores_ = decision

        # Calculate expected y from the pre-trained linear model
        if self.method == 'complex':
            # Kernel density estimate of the normalized scores; the
            # tenth root flattens the PDF before renormalizing.
            kde = gaussian_kde(decision)
            pdf = normalize(kde.pdf(decision))
            pdf = normalize(pdf**(1/10))
            log = normalize(np.log(decision + 1))

            pred = self.m1*decision + self.m2*log + self.m3*pdf + self.c
        else:
            pred = self.m*decision + self.c

        # Determine labels: any positive linear response is an outlier
        pred[pred > 0] = 1
        pred[pred <= 0] = 0

        # No scalar cut-point exists for this method; labels come
        # directly from the classifier's sign.
        self.thresh_ = None

        return pred.astype(int)