# Source code for pythresh.thresholds.fgd

import numpy as np

from .base import BaseThresholder
from .thresh_utility import cut, gen_kde


class FGD(BaseThresholder):
    """FGD class for Fixed Gradient Descent thresholder.

    Use the fixed gradient descent to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value beyond where the first derivative of the kde
    with respect to the decision scores passes the mean of the first and
    second inflection points. See :cite:`qi2021fgd` for details.

    Parameters
    ----------

    fallback : str ('ignore', 'warn', 'raise'), optional (default='warn')
        The action to take for thresholders when their criterion are not
        met. In these cases when set to 'ignore' on eval and fit all train
        data is set to inliers and the threshold is set to max of the
        train scores + eps. Passing 'warn' will do the same as 'ignore'
        but also produce a warning. If 'raise', the thresholder raises a
        ValueError.

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None. The value is stored on the instance and
        is also passed to ``np.random.seed``, i.e. it seeds NumPy's global
        random number generator at construction time.

    Attributes
    ----------

    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores
        (not assigned in this class; presumably set by the base class
        during data setup -- confirm against ``BaseThresholder``)

    Notes
    -----

    A probability distribution of the decision scores is generated using
    kernel density estimation. The first derivative of the pdf is
    calculated, and the threshold is set as the middle point between the
    first and second inflection points starting from the left side of the
    data range.

    In this implementation the two points are located as the first two
    positions at which the first derivative goes from strictly positive
    to non-positive. If fewer than two such positions exist, the
    threshold falls back to ``1.0 + eps``, where ``eps`` is the machine
    epsilon of the decision scores' dtype.
    """

    def __init__(self, fallback="warn", random_state=1234):
        super().__init__(fallback=fallback)
        self.random_state = random_state
        # NOTE: seeds NumPy's *global* RNG, not an instance-local generator.
        np.random.seed(random_state)

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        decision = self._data_setup(decision)

        # Generate KDE
        # (presumably evaluated over [0, 1] on 3 * n_samples grid points --
        # confirm against thresh_utility.gen_kde)
        val, dat_range = gen_kde(decision, 0, 1, len(decision) * 3)

        # Calculate the first derivative of the KDE with respect
        # to the data range; the spacing of the first two grid points is
        # used as the step for the whole grid.
        deriv = np.gradient(val, dat_range[1] - dat_range[0])

        # Find the first two positions i where the derivative switches from
        # strictly positive at i to non-positive at i + 1. Only the two
        # left-most positions are used.
        sign_drop = (deriv[:-1] > 0) & (deriv[1:] <= 0)
        ind = np.flatnonzero(sign_drop)[:2]

        eps = np.finfo(decision.dtype).eps

        if len(ind) > 1:
            # Midpoint (in data-range units) between the two positions.
            limit = (dat_range[ind[0]] + dat_range[ind[1]]) / 2
        else:
            # Criterion not met: place the limit just above 1.0.
            limit = 1.0 + eps

        # Validate the limit first (presumably applies the `fallback`
        # policy -- see BaseThresholder), then publish it.
        self._check_threshold(limit)
        self.thresh_ = limit

        return cut(decision, limit)