import numpy as np
from scipy.stats import gaussian_kde
from .base import BaseThresholder
[docs]
class CLF(BaseThresholder):
    r"""CLF class for Trained Classifier thresholder.

    Use a pre-trained linear classifier as a non-parametric means to
    threshold the scores produced by ``decision_scores``: every sample
    whose linear response is greater than 0 is labelled an outlier.
    See :cite:`aggarwal2017clf` for details.

    Parameters
    ----------
    method : {'simple', 'complex'}, optional (default='complex')
        Type of linear model

        - 'simple':  Uses only the scores
        - 'complex': Uses the scores, log of the scores, and the scores' PDF

        Any value other than 'complex' selects the 'simple' model.

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None. The value is handed to ``np.random.seed``
        when the object is constructed.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers.
        ``eval`` always sets it to ``None`` for this thresholder.

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The classifier was trained using a linear stochastic gradient descent
    method. A warm start was assigned to the classifier, which was then
    partially fit with the decision scores and true labels from multiple
    outlier detection methods available in `PyOD`. The :code:`generate_data`
    function from `PyOD` was used to create the outlier data, and the
    contaminations and random states were randomized at each iterative step.
    """

    def __init__(self, method="complex", random_state=1234):
        super().__init__()

        # Hard-coded coefficients of the pre-trained linear decision function.
        if method == "complex":
            # Weights applied to the scores, their log, and their PDF.
            self.m1, self.m2, self.m3 = (
                7.115947536708103,
                -5.934885742167458,
                -3.416078337348704,
            )
            self.c = 2.5731351150980992
        else:
            # Single-feature model: one slope and one intercept.
            self.m, self.c = 4.0581548062264075, -1.5357998356223497

        self.method = method
        self.random_state = random_state

        # NOTE: this seeds NumPy's global random state, not a private one.
        np.random.seed(random_state)

        # Names of the cached attributes that ``eval`` initialises to None
        # through ``_set_attributes`` when ``_is_fitted`` is None.
        self._attrs = ["_kde", "_knorm", "_pnorm", "_lnorm"]

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        if self._is_fitted is None:
            self._set_attributes(self._attrs, None)

        # Preprocessing is delegated to the base-class helper.
        decision = self._data_setup(decision)

        # Evaluate the linear model on the prepared scores.
        if self.method == "complex":
            # Build the density estimate once and reuse it on later calls.
            if self._kde is None:
                self._kde = gaussian_kde(decision)

            # PDF feature: normalise, clamp negatives, take the tenth root,
            # then normalise again.
            # NOTE(review): ``_set_norm`` is defined in the base class;
            # presumably it normalises using state kept under the given
            # attribute name - confirm there.
            density = self._set_norm(self._kde.pdf(decision), "_knorm")
            density[density < 0] = 0
            density = self._set_norm(density ** (1 / 10), "_pnorm")

            # Log feature: log(score + 1), normalised, negatives clamped.
            log_scores = self._set_norm(np.log(decision + 1), "_lnorm")
            log_scores[log_scores < 0] = 0

            pred = (
                self.m1 * decision
                + self.m2 * log_scores
                + self.m3 * density
                + self.c
            )
        else:
            pred = self.m * decision + self.c

        # Binarise on the sign of the response: > 0 is an outlier (1),
        # <= 0 is an inlier (0).
        is_outlier = pred > 0
        is_inlier = pred <= 0
        pred[is_outlier] = 1
        pred[is_inlier] = 0

        # This thresholder does not report a threshold value.
        self.thresh_ = None

        return pred.astype(int)