import numpy as np
import scipy.stats as stats
from .base import BaseThresholder
from .thresh_utility import check_scores, cut, gen_kde, normalize
# (Sphinx "[docs]" source-link marker removed: page-extraction residue, not code)
class YJ(BaseThresholder):
    r"""YJ class for Yeo-Johnson transformation thresholder.

    Use the Yeo-Johnson transformation to evaluate
    a non-parametric means to threshold scores generated by the
    decision_scores where outliers are set to any value beyond the
    max value in the YJ transformed data.
    See :cite:`raymaekers2021yj` for details.

    Parameters
    ----------
    random_state : int, optional (default=1234)
        Random seed for the random number generators of the thresholders.
        Can also be set to None.

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    dscores_ : 1D array of decomposed decision scores

    Notes
    -----
    The Yeo-Johnson transformation is a power transform which is a
    set of power functions that apply a monotonic transformation to
    the dataset. For the decision scores this makes their distribution
    more normal-like. The transformation is given by:

    .. math::

        \psi_{(y, \lambda)} = \begin{cases}
            \left((y+1)^\lambda-1\right)/\lambda & \text{if } \lambda \neq 0 \text{, } y \geq 0 \\
            \text{log}(y+1) & \text{if } \lambda = 0 \text{, } y \geq 0 \\
            -\left((-y+1)^{(2-\lambda)}-1\right)/{(2-\lambda)} & \text{if } \lambda \neq 2 \text{, } y < 0 \\
            -\text{log}(-y+1) & \text{if } \lambda = 2 \text{, } y < 0
        \end{cases} \mathrm{,}

    where :math:`\lambda` is a power parameter that is chosen via maximum
    likelihood estimation. Therefore, any values from the original decision
    scores that are beyond maximum value after this transformation are
    considered outliers. However, the closer a set of decision scores are
    to a normal distribution originally the smaller the probability this
    threshold will be able to identify outliers.
    """

    def __init__(self, random_state=1234):
        # Only forwarded to ``check_scores`` in ``eval``; nothing else in
        # this class consumes randomness.
        self.random_state = random_state

    def eval(self, decision):
        """Outlier/inlier evaluation process for decision scores.

        Sets ``self.dscores_`` to the checked and normalized scores and
        ``self.thresh_`` to the computed threshold.

        Parameters
        ----------
        decision : np.array or list of shape (n_samples)
                   or np.array of shape (n_samples, n_detectors)
                   which are the decision scores from an
                   outlier detection.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.
        """
        # Validate the input, then rescale it with the shared helpers
        # (presumably to the [0, 1] range used below -- confirm against
        # thresh_utility.normalize).
        decision = check_scores(decision, random_state=self.random_state)
        decision = normalize(decision)
        self.dscores_ = decision

        # Generate KDE. Only the first return value is used; presumably it
        # is the density evaluated between 0 and 1 at three points per
        # sample -- confirm against thresh_utility.gen_kde.
        val, _ = gen_kde(decision, 0, 1, len(decision) * 3)

        # Use Yeo-Johnson transformation to reshape distribution.
        # With the default ``lmbda=None`` SciPy fits lambda by a
        # deterministic maximum-likelihood optimisation, so repeated calls
        # on the same ``val`` return the same array. The transform is
        # therefore applied once instead of being averaged over 50
        # identical runs (same result up to floating-point rounding).
        transformed, _ = stats.yeojohnson(val)

        # Set limit to the max value from the transformation
        limit = np.max(transformed)
        self.thresh_ = limit

        return cut(decision, limit)