Source code for pythresh.thresholds.meta
import os
from os.path import dirname as up
import joblib
import numpy as np
import pandas as pd
import scipy.stats as stats
from numba import njit, prange
from sklearn.preprocessing import MinMaxScaler
from .base import BaseThresholder
from .thresh_utility import check_scores, normalize
[docs]
class META(BaseThresholder):
    r"""META class for Meta-modelling thresholder.

    Use a trained meta-model to evaluate a non-parametric means
    to threshold scores generated by the decision_scores where outliers
    are set based on the trained meta-model classifier.
    See :cite:`zhao2022meta` for details.

    Parameters
    ----------

    method : {'LIN', 'GNB', 'GNBC', 'GNBM'}, optional (default='GNBM')
        select

        - 'LIN':  RidgeCV trained linear classifier meta-model on true labels
        - 'GNB':  Gaussian Naive Bayes trained classifier meta-model on true labels
        - 'GNBC': Gaussian Naive Bayes trained classifier meta-model on best contamination
        - 'GNBM': Gaussian Naive Bayes multivariate trained classifier meta-model

    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders. Can also
        be set to None.

    Attributes
    ----------

    thresh_ : threshold value that separates inliers from outliers.
        ``eval`` always sets it to None, because the labels come from the
        meta-model vote and no single cut-off value is computed.

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----

    Meta-modelling is the creation of a model of models. If a dataset
    contains only the explanatory variables (X), yet no response
    variable (y), the response can still be predicted by using a
    meta-model. This is done by modelling datasets with known response
    variables that are similar to the dataset that is missing the
    response variable.

    The META thresholder was trained using the ``PyOD`` outlier
    detection methods ``LODA, QMCD, CD, MCD, GMM, KNN, KDE, PCA, Sampling`` and ``IForest``
    on the AD benchmark datasets: ``ALOI, annthyroid, breastw, campaign, cardio,
    Cardiotocography, fault, glass, Hepatitis, Ionosphere, landsat, letter, Lymphography,
    magic.gamma, mammography, mnist, musk, optdigits, PageBlocks, pendigits, Pima,
    satellite, satimage-2, shuttle, smtp, SpamBase, speech, Stamps, thyroid, vertebral,
    vowels, Waveform, WBC, WDBC, Wilt, wine, WPBC, yeast`` available at
    `ADBench dataset <https://github.com/Minqi824/ADBench/tree/main/adbench/datasets/Classical>`_.
    META uses a majority vote of all the trained models to determine the
    inlier/outlier labels.

    Update: the latest GNBC model was further trained on the ``backdoor, celeba, census,
    cover, donors, fraud, http, InternetAds,`` and ``skin`` datasets and additionally using
    the ``AutoEncoder, LUNAR, OCSVM, HBOS, KPCA,`` and ``DIF`` outlier detection methods.

    """

    # File name of the serialized meta-model used for each supported method.
    _MODEL_FILES = {
        'LIN': 'meta_model_LIN.pkl',
        'GNB': 'meta_model_GNB.pkl',
        'GNBC': 'meta_model_GNBC.pkl',
        'GNBM': 'meta_model_GNBM.pkl',
    }

    def __init__(self, method='GNBM', random_state=1234):
        # Parameters are only stored here; ``method`` is validated in eval.
        self.method = method
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        Raises
        ------
        ValueError
            If ``method`` is not one of 'LIN', 'GNB', 'GNBC' or 'GNBM'.

        Warns
        -----
        UserWarning
            If no meta-model group predicts an outlier ratio strictly
            between 0 and 0.5; every sample is then labelled an inlier.
        """
        # Function-scope import keeps this block self-contained.
        import warnings

        # Reject unknown methods explicitly. Silently falling back to the
        # GNBM model would feed it a frame without the 'qmcd' and 'kdes'
        # columns that are only built for method == 'GNBM'.
        if self.method not in self._MODEL_FILES:
            raise ValueError(
                f"method must be one of {sorted(self._MODEL_FILES)}, "
                f"got {self.method!r}"
            )

        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)

        self.dscores_ = decision

        # NOTE: joblib.load unpickles the file. Only the model files found in
        # the 'models' directory one level above this module's directory are
        # loaded here, never a caller-supplied path.
        parent = up(up(__file__))
        model = joblib.load(
            os.path.join(parent, 'models', self._MODEL_FILES[self.method])
        )

        # Feature frame shared by every group. The column order is kept as
        # scores, groups, [qmcd, kdes]; 'groups' is a placeholder that is
        # overwritten for each prediction below.
        features = pd.DataFrame()
        features['scores'] = decision
        features['groups'] = 0

        if self.method == 'GNBM':
            qmcd, kdes = self._multivariate_features(decision)
            features['qmcd'] = qmcd
            features['kdes'] = kdes

        n_samples = len(decision)
        votes = []

        for group in range(len(model.groups_)):

            # assign() returns a new frame with 'groups' replaced in place,
            # so every predict call receives its own copy.
            labels = model.predict(features.assign(groups=group))
            outlier_ratio = np.sum(labels)/n_samples

            # Keep only predictions that flag some, but fewer than half,
            # of the samples as outliers.
            if 0 < outlier_ratio < 0.5:
                votes.append(labels)

        if votes:
            # Per-sample majority vote across the retained predictions.
            lbls = stats.mode(np.array(votes), axis=0)
            lbls = np.squeeze(lbls[0])
        else:
            # Without any retained prediction the mode of an empty array is
            # not a label vector, so return a well-formed all-inlier result.
            warnings.warn(
                'META: no meta-model group predicted an outlier ratio in '
                '(0, 0.5); all samples are labelled as inliers.'
            )
            lbls = np.zeros(n_samples, dtype=int)

        self.thresh_ = None

        return lbls

    def _multivariate_features(self, decision):
        """Build the extra 'qmcd' and 'kdes' features used by the GNBM model.

        Parameters
        ----------
        decision : 1D np.array of normalized decision scores
            (assumed 1D as returned by check_scores/normalize -- it is
            reshaped to a single column below).

        Returns
        -------
        qmcd : np.array
            Normalized wrap-around discrepancy value of each score.
        kdes : np.array
            Tenth root of the normalized Gaussian KDE density of each score.
        """
        scaler = MinMaxScaler()
        norm = scaler.fit_transform(decision.reshape(-1, 1))
        # np.spacing(0) is the smallest positive float; it keeps the
        # division defined when the column maximum is exactly zero.
        norm = (norm/(norm.max(axis=0, keepdims=True)
                      + np.spacing(0)))

        qmcd = normalize(self._wrap_around_discrepancy(norm))

        # Flip the values when more than half of them exceed 0.5, so that
        # the majority of samples end up below 0.5.
        if np.count_nonzero(qmcd > 0.5) > 0.5*len(qmcd):
            qmcd = 1 - qmcd

        kde = stats.gaussian_kde(decision)
        pdf = normalize(kde.pdf(decision))

        return qmcd, pdf**(1/10)

    @staticmethod
    @njit(fastmath=True, parallel=True)
    def _wrap_around_discrepancy(data):  # pragma: no cover
        """Per-sample wrap-around discrepancy terms.

        For each row ``i`` of ``data`` (shape ``(n, d)``) this computes
        ``-(4/3)**d + (1/n**2) * sum_j prod_k (3/2 - x + x**2)`` with
        ``x = |data[i, k] - data[j, k]|``.

        Parameters
        ----------
        data : 2D np.array of shape (n, d)

        Returns
        -------
        disc : np.array of shape (n,)
        """
        n = data.shape[0]
        d = data.shape[1]
        disc = np.zeros(n)

        # NOTE(review): per the numba documentation only the outermost
        # prange is run in parallel; nested prange loops execute as
        # ordinary range loops.
        for i in prange(n):
            dc = 0.0
            for j in prange(n):
                prod = 1.0
                for k in prange(d):
                    x_kikj = abs(data[i, k] - data[j, k])
                    prod *= 3.0 / 2.0 - x_kikj + x_kikj ** 2
                dc += prod
            disc[i] = dc

        return - (4.0 / 3.0) ** d + 1.0 / (n ** 2) * disc