# Source code for pythresh.thresholds.chau

import numpy as np
import scipy.stats as stats
from scipy.special import erfc

from .base import BaseThresholder
from .thresh_utility import check_scores, cut, normalize



# [docs]
class CHAU(BaseThresholder):
    r"""CHAU class for Chauvenet's criterion thresholder.

       Use the Chauvenet's criterion to evaluate a non-parametric
       means to threshold scores generated by the decision_scores
       where outliers are set to any value below the Chauvenet's
       criterion. See :cite:`bolshev2016chau` for details

       Parameters
       ----------

       method : {'mean', 'median', 'gmean'}, optional (default='mean')
            Calculate the area normal to distance using a scaler

            - 'mean':   Construct a scaler with the mean of the scores
            - 'median': Construct a scaler with the median of the scores
            - 'gmean':  Construct a scaler with the geometric mean of the scores

       random_state : int, optional (default=1234)
            Random seed for the random number generators of the thresholders. Can also
            be set to None.

       Attributes
       ----------

       thresh_ : threshold value that separates inliers from outliers

       dscores_ : 1D array of decomposed decision scores

       Notes
       -----

       The Chauvenet's criterion for a one tail of a distribution is defined
       as follows:

       .. math::

           D_{\mathrm{max}}>Z \mathrm{,}

       where :math:`D_{\mathrm{max}}` is the bounds of the probability band
       around the mean given by,

       .. math::

           D_{\mathrm{max}} = \lvert norm.ppf(Pz) \rvert \mathrm{,}

       where this bound is equal to the inverse of a cumulative distribution function
       for a probability of one of the tails of the normal distribution, and :math:`P_z`
       is therefore defined as,

       .. math::

           P_z = \frac{1}{4n} \mathrm{,}

       with :math:`n` being the number of samples in the decision scores. Finally the z-score
       can be calculated as follows:

       .. math::

           Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}

       with :math:`\bar{x}` as the mean and :math:`\sigma` the standard deviation
       of the decision scores.

       CHAU employs variants of the classical Chauvenet's criterion as the mean can be
       replaced with the geometric mean or the median.

       Any z-score greater than the Chauvenet's criterion is considered an outlier.


    """

    def __init__(self, method='mean', random_state=1234):

        super().__init__()
        # Translate the option name into the callable used to centre the
        # scores. An unrecognised name fails here with KeyError from the
        # dictionary lookup.
        stat = {'mean': np.mean, 'median': np.median, 'gmean': stats.gmean}
        self.method = stat[method]
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Besides returning the labels, this stores the processed scores in
        ``dscores_`` and the computed threshold in ``thresh_``.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """

        # NOTE(review): check_scores / normalize are project helpers not
        # visible here; presumably they validate the input, reduce it to a
        # 1D array and rescale it -- confirm against thresh_utility.
        decision = check_scores(decision, random_state=self.random_state)

        decision = normalize(decision)

        self.dscores_ = decision

        # Calculate Chauvenet's criterion for one tail: the tail probability
        # is 1/(4n) and the criterion is the reciprocal of the magnitude of
        # the matching normal quantile.
        Pz = 1/(4*len(decision))
        criterion = 1/abs(stats.norm.ppf(Pz))

        # Get area normal to distance: erfc(z / sqrt(2)) is the two-sided
        # normal tail probability of a standardised deviation z, where the
        # centre is the statistic chosen at construction time.
        center = self.method(decision)
        deviation = np.abs(decision-center) / decision.std()
        prob = erfc(deviation/2.0**0.5)

        self.thresh_ = criterion * (1-np.min(prob))/np.max(prob)

        # NOTE(review): cut is a project helper; presumably it labels values
        # above the limit as 1, so the complement marks small tail
        # probabilities as outliers -- confirm against thresh_utility.
        return 1-cut(prob, criterion)