# Source code for pythresh.thresholds.qmcd

import inspect

import numpy as np
import scipy.stats as stats

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize


class QMCD(BaseThresholder):
    r"""QMCD class for Quasi-Monte Carlo Discrepancy thresholder.

       Use the quasi-Monte Carlo discrepancy to evaluate a
       non-parametric means to threshold scores generated by the
       decision_scores, where outliers are set to any value beyond a
       percentile or quantile of one minus the discrepancy.
       See :cite:`iouchtchenko2019qmcd` for details.

       Parameters
       ----------

       method : {'CD', 'WD', 'MD', 'L2-star'}, optional (default='WD')
           Type of discrepancy

           - 'CD':      Centered Discrepancy
           - 'WD':      Wrap-around Discrepancy
           - 'MD':      Mix between CD/WD
           - 'L2-star': L2-star discrepancy

       lim : {'Q', 'P'}, optional (default='P')
           Filtering method to threshold scores using 1 - discrepancy

           - 'Q': Use quantile limiting
           - 'P': Use percentile limiting

       random_state : int, optional (default=1234)
           Random seed for the random number generators of the
           thresholders. Can also be set to None.

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers

       dscores_ : 1D array of decomposed decision scores

       Notes
       -----

       For the QMCD method it is assumed that the decision scores are
       pseudo-random values within a distribution :math:`M`.
       "Quasi-random" sequences, which are numbers that are better
       equidistributed for :math:`M` than pseudo-random numbers, are
       used to calculate the decision scores discrepancy value. The
       discrepancy value is a uniformity criterion which is used to
       assess the space filling of a number of samples in a hypercube.
       It quantifies the distance between the continuous uniform
       distribution on a hypercube and the discrete uniform
       distribution on distinct sample points. Therefore, lower values
       mean better coverage of the parameter space. The QMCD method
       utilizes the discrepancy value by assuming that when it is at
       its lowest value (0) the "quasi-random" generated sequences and
       the decision scores are equally equidistributed across
       :math:`M`. Outliers are assumed to solely raise the discrepancy
       value. And therefore, the contamination of the dataset can be
       set as one minus the discrepancy.

    """

    def __init__(self, method='WD', lim='P', random_state=1234):
        super().__init__()
        self.method = method
        self.lim = lim
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

           Parameters
           ----------
           decision : np.array or list of shape (n_samples)
                      or np.array of shape (n_samples, n_detectors)
                      which are the decision scores from a
                      outlier detection.

           Returns
           -------
           outlier_labels : numpy array of shape (n_samples,)
               For each observation, tells whether or not
               it should be considered as an outlier according to the
               fitted model. 0 stands for inliers and 1 for outliers.

           Raises
           ------
           ValueError
               If ``lim`` is not one of ``'Q'`` or ``'P'``.
        """

        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        self.dscores_ = decision

        # Get the quasi Monte-Carlo discrepancy of the scores; the
        # scores are treated as a 1D sample in the unit hypercube.
        disc = stats.qmc.discrepancy(
            decision.reshape(-1, 1), method=self.method)

        # Set the limit to either the quantile or percentile of
        # 1 - discrepancy
        if self.lim == 'Q':
            limit = np.quantile(decision, 1.0 - disc)
        elif self.lim == 'P':
            # NumPy 1.22 renamed np.percentile's `interpolation`
            # keyword to `method`; detect which one this NumPy
            # version accepts so both old and new releases work.
            arg_map = {'old': 'interpolation', 'new': 'method'}
            arg_name = (arg_map['new'] if 'method' in
                        inspect.signature(np.percentile).parameters
                        else arg_map['old'])
            limit = np.percentile(decision, (1.0 - disc) * 100,
                                  **{arg_name: 'midpoint'})
        else:
            # Fail loudly on a bad option instead of leaking an
            # unbound-local NameError further down.
            raise ValueError(
                f"lim must be 'Q' or 'P', got {self.lim!r}")

        self.thresh_ = limit

        return cut(decision, limit)