# Source code for pythresh.thresholds.meta

import os
from os.path import dirname as up

import joblib
import numpy as np
import pandas as pd
import scipy.stats as stats
from numba import njit, prange
from sklearn.preprocessing import MinMaxScaler

from .base import BaseThresholder
from .thresh_utility import check_scores, normalize



# [docs] (Sphinx viewcode link marker left over from the documentation export)
class META(BaseThresholder):
    r"""META class for Meta-modelling thresholder.

       Use a trained meta-model to evaluate a non-parametric means
       to threshold scores generated by the decision_scores where outliers
       are set based on the trained meta-model classifier.
       See :cite:`zhao2022meta` for details.

       Parameters
       ----------

       method : {'LIN', 'GNB', 'GNBC', 'GNBM'}, optional (default='GNBM')
           select

           - 'LIN':  RidgeCV trained linear classifier meta-model on true labels
           - 'GNB':  Gaussian Naive Bayes trained classifier meta-model on true labels
           - 'GNBC': Gaussian Naive Bayes trained classifier meta-model on best contamination
           - 'GNBM': Gaussian Naive Bayes multivariate trained classifier meta-model

           Any other value makes ``eval`` raise a ``ValueError``.

       random_state : int, optional (default=1234)
            Random seed for the random number generators of the thresholders. Can also
            be set to None.

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers. ``eval``
           always sets it to ``None`` for this thresholder, because the labels
           are predicted by the meta-model rather than derived from a cut-off.

       dscores_ : 1D array of decomposed decision scores (stored after
           normalization)

       Notes
       -----

       Meta-modelling is the creation of a model of models. If a dataset
       contains only the explanatory variables (X), yet no response
       variable (y), the response can still be predicted by using a
       meta-model. This is done by modelling datasets with known response
       variables that are similar to the dataset that is missing the
       response variable.

       The META thresholder was trained using the ``PyOD`` outlier
       detection methods ``LODA, QMCD, CD, MCD, GMM, KNN, KDE, PCA, Sampling`` and ``IForest``
       on the AD benchmark datasets: ``ALOI, annthyroid, breastw, campaign, cardio,
       Cardiotocography, fault, glass, Hepatitis, Ionosphere, landsat, letter, Lymphography,
       magic.gamma, mammography, mnist, musk, optdigits, PageBlocks, pendigits, Pima,
       satellite, satimage-2, shuttle, smtp, SpamBase, speech, Stamps, thyroid, vertebral,
       vowels, Waveform,  WBC, WDBC, Wilt, wine, WPBC, yeast`` available at
       `ADBench dataset <https://github.com/Minqi824/ADBench/tree/main/adbench/datasets/Classical>`_.
       META uses a majority vote of all the trained models to determine the
       inlier/outlier labels.

       Update: the latest GNBC model was further trained on the ``backdoor, celeba, census,
       cover, donors, fraud, http, InternetAds,`` and ``skin`` datasets and additionally using
       the ``AutoEncoder, LUNAR, OCSVM, HBOS, KPCA,`` and ``DIF`` outlier detection methods.

    """

    # Maps each supported ``method`` to the serialized meta-model it loads.
    _MODEL_FILES = {
        'LIN': 'meta_model_LIN.pkl',
        'GNB': 'meta_model_GNB.pkl',
        'GNBC': 'meta_model_GNBC.pkl',
        'GNBM': 'meta_model_GNBM.pkl',
    }

    def __init__(self, method='GNBM', random_state=1234):

        self.method = method
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
            If no model group yields an outlier ratio strictly between
            0 and 0.5, every observation is labelled as an inlier.

        Raises
        ------
        ValueError
            If ``method`` is not one of 'LIN', 'GNB', 'GNBC' or 'GNBM'.
        """

        # Fail fast on an unknown method. Previously such a value silently
        # loaded the GNBM model while skipping the GNBM-only features.
        try:
            clf = self._MODEL_FILES[self.method]
        except KeyError:
            raise ValueError(
                f"method must be one of {sorted(self._MODEL_FILES)}, "
                f"got {self.method!r}"
            ) from None

        decision = check_scores(decision, random_state=self.random_state)

        decision = normalize(decision)

        self.dscores_ = decision

        counts = len(decision)

        # The serialized models live in the ``models`` directory two levels
        # above this file.
        parent = up(up(__file__))
        model = joblib.load(os.path.join(parent, 'models', clf))

        use_extra = self.method == 'GNBM'
        if use_extra:
            # Loop-invariant features, computed once for all groups.
            qmcd, kdes = self._gnbm_features(decision)

        votes = []
        for group in range(len(model.groups_)):

            # Column order is kept as scores, groups, qmcd, kdes in case the
            # model depends on it.
            df = pd.DataFrame()
            df['scores'] = decision
            df['groups'] = group

            if use_extra:

                df['qmcd'] = qmcd
                df['kdes'] = kdes

            labels = model.predict(df)
            outlier_ratio = np.sum(labels)/counts

            # Only groups that flag some, but fewer than half, of the samples
            # get a vote.
            if 0 < outlier_ratio < 0.5:

                votes.append(labels)

        # No single threshold exists: labels come straight from the models.
        self.thresh_ = None

        if not votes:
            # Without any valid vote there is nothing to take a mode over;
            # fall back to "no outliers" so callers still receive one label
            # per sample.
            return np.zeros(counts, dtype=int)

        # Majority vote (per-sample mode) across the retained groups.
        lbls = stats.mode(np.array(votes), axis=0)
        lbls = np.squeeze(lbls[0])

        return lbls

    def _gnbm_features(self, decision):
        """Build the two extra per-sample features used by the 'GNBM' model.

        Returns the normalized wrap-around discrepancy of the scores and the
        tenth root of the normalized Gaussian-KDE density of the scores.
        """

        scaler = MinMaxScaler()
        norm = scaler.fit_transform(decision.reshape(-1, 1))
        # np.spacing(0) keeps the denominator non-zero when the column max
        # is 0.
        norm = (norm/(norm.max(axis=0, keepdims=True)
                      + np.spacing(0)))

        qmcd = self._wrap_around_discrepancy(norm)

        qmcd = normalize(qmcd)
        # Flip the feature when more than half of the values lie above 0.5.
        if len(qmcd[qmcd > 0.5]) > 0.5*len(qmcd):
            qmcd = 1 - qmcd

        kde = stats.gaussian_kde(decision)
        pdf = normalize(kde.pdf(decision))

        return qmcd, pdf**(1/10)

    @staticmethod
    @njit(fastmath=True, parallel=True)
    def _wrap_around_discrepancy(data):  # pragma: no cover
        """Per-sample wrap-around discrepancy of a 2D array.

        ``data`` is indexed as ``data[sample, feature]``. For every sample
        ``i`` the kernel ``3/2 - |x| + |x|**2`` (``x`` being the per-feature
        difference to sample ``j``) is multiplied over the features and
        summed over all ``j``. The result is a 1D array with one value per
        sample.
        """

        n = data.shape[0]
        d = data.shape[1]

        disc = np.zeros(n)

        # Only the outermost prange is run in parallel by numba; nested
        # prange loops are serialized, so the inner loops use plain range.
        for i in prange(n):
            dc = 0.0
            for j in range(n):
                prod = 1.0
                for k in range(d):
                    x_kikj = abs(data[i, k] - data[j, k])
                    prod *= 3.0 / 2.0 - x_kikj + x_kikj ** 2

                dc += prod
            disc[i] = dc

        return - (4.0 / 3.0) ** d + 1.0 / (n ** 2) * disc