# Source code for pythresh.thresholds.qmcd

import inspect

import numpy as np
import scipy.stats as stats

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize


class QMCD(BaseThresholder):
    r"""QMCD class for Quasi-Monte Carlo Discrepancy thresholder.

       Use the quasi-Monte Carlo discrepancy to evaluate a
       non-parametric means to threshold scores generated by the
       decision_scores, where outliers are set to any value beyond a
       percentile or quantile of one minus the discrepancy.
       See :cite:`iouchtchenko2019qmcd` for details.

       Parameters
       ----------

       method : {'CD', 'WD', 'MD', 'L2-star'}, optional (default='WD')
           Type of discrepancy

           - 'CD':      Centered Discrepancy
           - 'WD':      Wrap-around Discrepancy
           - 'MD':      Mix between CD/WD
           - 'L2-star': L2-star discrepancy

       lim : {'Q', 'P'}, optional (default='P')
           Filtering method to threshold scores using 1 - discrepancy

           - 'Q': Use quantile limiting
           - 'P': Use percentile limiting

       random_state : int, optional (default=1234)
           Random seed for the random number generators of the
           thresholders. Can also be set to None.

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers

       dscores_ : 1D array of decomposed decision scores

       Notes
       -----

       For the QMCD method it is assumed that the decision scores are
       pseudo-random values within a distribution :math:`M`.
       "Quasi-random" sequences, which are numbers that are better
       equidistributed for :math:`M` than pseudo-random numbers, are
       used to calculate the decision scores discrepancy value. The
       discrepancy value is a uniformity criterion which is used to
       assess the space filling of a number of samples in a hypercube.
       It quantifies the distance between the continuous uniform
       distribution on a hypercube and the discrete uniform
       distribution on distinct sample points. Therefore, lower values
       mean better coverage of the parameter space. The QMCD method
       utilizes the discrepancy value by assuming that when it is at
       its lowest value (0) the "quasi-random" generated sequences and
       the decision scores are equally equidistributed across
       :math:`M`. Outliers are assumed to solely raise the discrepancy
       value. And therefore, the contamination of the dataset can be
       set as one minus the discrepancy.

    """

    def __init__(self, method='WD', lim='P', random_state=1234):
        super().__init__()
        self.method = method
        self.lim = lim
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

           Parameters
           ----------
           decision : np.array or list of shape (n_samples)
                      or np.array of shape (n_samples, n_detectors)
                      which are the decision scores from a
                      outlier detection.

           Returns
           -------
           outlier_labels : numpy array of shape (n_samples,)
               For each observation, tells whether or not
               it should be considered as an outlier according to the
               fitted model. 0 stands for inliers and 1 for outliers.

           Raises
           ------
           ValueError
               If ``lim`` is not one of ``'Q'`` or ``'P'``.
        """

        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        self.dscores_ = decision

        # Get the quasi Monte-Carlo discrepancy of the scores; the
        # scores are treated as a 1D sample in the unit hypercube.
        disc = stats.qmc.discrepancy(
            decision.reshape(-1, 1), method=self.method)

        # Set the limit to either the quantile or percentile of
        # 1 - discrepancy
        if self.lim == 'Q':
            limit = np.quantile(decision, 1.0 - disc)
        elif self.lim == 'P':
            # NumPy 1.22 renamed np.percentile's `interpolation`
            # keyword to `method`; detect which one this NumPy
            # version accepts so both old and new releases work.
            arg_map = {'old': 'interpolation', 'new': 'method'}
            arg_name = (arg_map['new'] if 'method' in
                        inspect.signature(np.percentile).parameters
                        else arg_map['old'])
            limit = np.percentile(decision, (1.0 - disc) * 100,
                                  **{arg_name: 'midpoint'})
        else:
            # Fail loudly on a bad option instead of leaking an
            # unbound-local NameError further down.
            raise ValueError(
                f"lim must be 'Q' or 'P', got {self.lim!r}")

        self.thresh_ = limit

        return cut(decision, limit)