# Source code for pythresh.thresholds.yj

import numpy as np
import scipy.stats as stats

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, gen_kde, normalize


class YJ(BaseThresholder):
    r"""YJ class for Yeo-Johnson transformation thresholder.

       Use the Yeo-Johnson transformation to evaluate a non-parametric
       means to threshold scores generated by the decision_scores where
       outliers are set to any value beyond the max value in the YJ
       transformed data. See :cite:`raymaekers2021yj` for details.

       Parameters
       ----------

       random_state : int, optional (default=1234)
            Random seed for the random number generators of the thresholders.
            Can also be set to None.

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers

       dscores_ : 1D array of decomposed decision scores

       Notes
       -----

       The Yeo-Johnson transformation is a power transform which is a set of
       power functions that apply a monotonic transformation to the dataset.
       For the decision scores this make their distribution more normal-like.
       The transformation is given by:

       .. math::

           \psi_{(y, \lambda)} = \begin{cases} \left((y+1)^\lambda-1\right)/\lambda & \text{if } \lambda \neq 0 \text{, } y \geq 0 \\
                                 \text{log}(y+1) & \text{if } \lambda = 0 \text{, } y \geq 0 \\
                                 -\left((-y+1)^{(2-\lambda)}-1\right)/{(2-\lambda)} & \text{if } \lambda \neq 2 \text{, } y < 0 \\
                                 -\text{log}(-y+1) & \text{if } \lambda = 2 \text{, } y < 0 \end{cases} \mathrm{,}

       where :math:`\lambda` is a power parameter that is chosen via maximum
       likelihood estimation. Therefore, any values from the original decision
       scores that are beyond maximum value after this transformation are
       considered outliers. However, the closer a set of decision scores are
       to a normal distribution originally the smaller the probability this
       threshold will be able to identify outliers.

    """

    def __init__(self, random_state=1234):
        # Seed forwarded to check_scores for any stochastic decomposition
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from a
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """

        decision = check_scores(decision, random_state=self.random_state)

        decision = normalize(decision)

        self.dscores_ = decision

        # Generate KDE of the normalized scores over [0, 1]
        val, _ = gen_kde(decision, 0, 1, len(decision)*3)

        # Use Yeo-Johnson transformation to reshape the distribution.
        # NOTE: the previous implementation averaged 50 repeated calls to
        # stats.yeojohnson, but the transform is deterministic for a fixed
        # input (lambda is chosen by MLE, no randomness involved), so all
        # 50 results were identical — a single call is equivalent and 50x
        # cheaper.
        scores = stats.yeojohnson(val)[0]

        # Set limit to the max value from the transformation; any original
        # score beyond it is labeled an outlier by cut()
        limit = np.max(scores)

        self.thresh_ = limit

        return cut(decision, limit)