# Source code for pythresh.thresholds.decomp

import numpy as np
from sklearn.decomposition import NMF, PCA
from sklearn.random_projection import (
    GaussianRandomProjection,
    SparseRandomProjection
)

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, gen_cdf, normalize



# [docs]
class DECOMP(BaseThresholder):
    """DECOMP class for Decomposition based thresholders.

       Use decomposition to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set to any value beyond the maximum of the decomposed
       matrix that results from decomposing the cumulative distribution
       function of the decision scores.
       See :cite:`boente2002decomp` for details

       Parameters
       ----------

       method : {'NMF', 'PCA', 'GRP', 'SRP'}, optional (default='PCA')
            Method to use for decomposition

            - 'NMF':  Non-Negative Matrix Factorization
            - 'PCA':  Principal Component Analysis
            - 'GRP':  Gaussian Random Projection
            - 'SRP':  Sparse Random Projection

       random_state : int, optional (default=1234)
            Random seed for the decomposition algorithm. Can also be set to None.

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers

       dscores_ : 1D array of the decision scores after ``check_scores`` and
            ``normalize`` have been applied; these are the scores that the
            threshold is applied to

       method_funcs : dict mapping each supported ``method`` name to the
            decomposition estimator built in the constructor

       Examples
       --------
       The effects of randomness can affect the thresholder's output performance
       significantly. Therefore, to alleviate the effects of randomness on the
       thresholder a combined model can be used with different random_state values.
       E.g.

       .. code:: python

            # train the KNN detector
            from pyod.models.knn import KNN
            from pythresh.thresholds.comb import COMB
            from pythresh.thresholds.decomp import DECOMP

            clf = KNN()
            clf.fit(X_train)

            # get outlier scores
            decision_scores = clf.decision_scores_  # raw outlier scores

            # get outlier labels with combined model
            thres = COMB(thresholders = [DECOMP(random_state=1234),
            DECOMP(random_state=42), DECOMP(random_state=9685),
            DECOMP(random_state=111222)])
            labels = thres.eval(decision_scores)
    """

    def __init__(self, method='PCA', random_state=1234):

        self.method = method

        # One estimator per supported method, all seeded identically. They
        # are built eagerly here; ``eval`` selects one by name and refits it
        # on every call through ``fit_transform``.
        self.method_funcs = {'NMF': NMF(n_components=1,
                                        random_state=random_state),
                             'PCA': PCA(n_components=1,
                                        random_state=random_state),
                             'GRP': GaussianRandomProjection(n_components=2,
                                                             random_state=random_state),
                             'SRP': SparseRandomProjection(n_components=3,
                                                           random_state=random_state)}
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        Raises
        ------
        KeyError
            If ``method`` is not one of the keys of ``method_funcs``.

        Notes
        -----
        The threshold is the maximum of the decomposed CDF values. When that
        maximum is greater than 0.5 it is replaced by ``1 - maximum``. The
        result is stored in ``thresh_``.
        """

        decision = check_scores(decision, random_state=self.random_state)

        decision = normalize(decision)

        # Keep the normalized scores so callers can inspect what was cut
        self.dscores_ = decision

        # Generate a CDF of the decision scores
        # NOTE(review): 0, 1 and len(decision)*3 are presumably the value
        # range and the number of evaluation points - confirm in gen_cdf
        val, dat_range = gen_cdf(decision, 0, 1, len(decision)*3)
        val = normalize(val)

        # Apply decomposition; the CDF values are passed as a single
        # feature column (n_points, 1)
        dec = self.method_funcs[str(self.method)].fit_transform(
            val.reshape(-1, 1))

        # Set limit to max value from decomposition matrix
        limit = np.max(dec)

        # Fold a maximum from the upper half back as 1 - max
        limit = 1-limit if limit > 0.5 else limit

        self.thresh_ = limit

        return cut(decision, limit)