# Source code for pythresh.thresholds.clf

import numpy as np
from scipy.stats import gaussian_kde

from .base import BaseThresholder
from .thresh_utility import check_scores, normalize


class CLF(BaseThresholder):
    r"""CLF class for Trained Classifier thresholder.

    Use the trained linear classifier to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set to any value beyond 0. See :cite:`aggarwal2017clf` for
    details.

    Parameters
    ----------
    method : {'simple', 'complex'}, optional (default='complex')
        Type of linear model

        - 'simple':  Uses only the scores
        - 'complex': Uses the scores, log of the scores, and the
          scores' PDF

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the
        thresholders. Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The classifier was trained using a linear stochastic gradient decent
    method. A warm start was assigned to the classifier was partially
    fit with the decision scores and true labels from multiple outlier
    detection methods available in `PyOD`. The :code:`generate_data`
    function from `PyOD` was used to create the outlier data, and the
    contaminations and random states were randomized each iterative
    step.
    """

    def __init__(self, method='complex', random_state=1234):
        # Pre-trained linear-model coefficients, fit offline with SGD
        # on PyOD-generated data (see Notes). They are fixed constants,
        # not re-fit at runtime.
        if method == 'complex':
            # Weights for the score, log-score, and score-PDF terms.
            self.m1 = 7.115947536708103
            self.m2 = -5.934885742167458
            self.m3 = -3.416078337348704
            self.c = 2.5731351150980992
        else:
            # Single weight for the raw scores only.
            self.m = 4.0581548062264075
            self.c = -1.5357998356223497

        self.method = method
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
            or np.array of shape (n_samples, n_detectors)
            which are the decision scores from a
            outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not it should be
            considered as an outlier according to the fitted model. 0
            stands for inliers and 1 for outliers.
        """
        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        self.dscores_ = decision

        # Calculate expected y from the pre-trained linear model
        if self.method == 'complex':
            # Kernel density estimate of the normalized scores; the
            # tenth root flattens the PDF before renormalizing.
            kde = gaussian_kde(decision)
            pdf = normalize(kde.pdf(decision))
            pdf = normalize(pdf**(1/10))
            log = normalize(np.log(decision + 1))

            pred = self.m1*decision + self.m2*log + self.m3*pdf + self.c
        else:
            pred = self.m*decision + self.c

        # Determine labels: any positive linear response is an outlier
        pred[pred > 0] = 1
        pred[pred <= 0] = 0

        # No scalar cut-point exists for this method; labels come
        # directly from the classifier's sign.
        self.thresh_ = None

        return pred.astype(int)