Source code for pythresh.thresholds.hist

import numpy as np
from scipy import ndimage as ndi

from .base import BaseThresholder
from .thresh_utility import check_scores, normalize

# https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/filters/thresholding.py


[docs] class HIST(BaseThresholder): """HIST class for Histogram based thresholders. Use histograms methods as described in scikit-image.filters to evaluate a non-parametric means to threshold scores generated by the decision_scores where outliers are set by histogram generated thresholds depending on the selected methods. See :cite:`thanammal2015hist` for details. Parameters ---------- nbins : int, optional (default='auto') Number of bins to use in the histogram, default set to int(len(scores)**0.7) method : {'otsu', 'yen', 'isodata', 'li', 'minimum', 'triangle'}, optional (default='triangle') Histogram filtering based method - 'otsu': OTSU's method for filtering - 'yen': Yen's method for filtering - 'isodata': Ridler-Calvard or inter-means method for filtering - 'li': Li's iterative Minimum Cross Entropy method for filtering - 'minimum': Minimum between two maxima via smoothing method for filtering - 'triangle': Triangle algorithm method for filtering random_state : int, optional (default=1234) Random seed for the random number generators of the thresholders. Can also be set to None. Attributes ---------- thresh_ : threshold value that separates inliers from outliers dscores_ : 1D array of decomposed decision scores """ def __init__(self, method='triangle', nbins='auto', random_state=1234): super().__init__() self.nbins = nbins self.method = method self.method_funcs = {'otsu': self._OTSU_thres, 'yen': self._YEN_thres, 'isodata': self._ISODATA_thres, 'li': self._LI_thres, 'minimum': self._Minimum_thres, 'triangle': self._Triangle_thres} self.random_state = random_state
[docs] def eval(self, decision): """Outlier/inlier evaluation process for decision scores. Parameters ---------- decision : np.array or list of shape (n_samples) or np.array of shape (n_samples, n_detectors) which are the decision scores from a outlier detection. Returns ------- outlier_labels : numpy array of shape (n_samples,) For each observation, tells whether or not it should be considered as an outlier according to the fitted model. 0 stands for inliers and 1 for outliers. """ decision = check_scores(decision, random_state=self.random_state) decision = normalize(decision) self.dscores_ = decision # Set adaptive default if bins are None if self.nbins == 'auto': self.nbins = int(len(decision)**0.7) # Generate histogram bin_centers, counts = self._histogram(decision, self.nbins) # Threshold histogram if self.method != 'li': threshold = self.method_funcs[str( self.method)](bin_centers, counts) else: threshold = self.method_funcs[str(self.method)]( decision, bin_centers, counts) # Evaluate scores and create labels outliers = np.where(decision > threshold) scores = np.zeros(len(decision), dtype=int) scores[outliers] = 1 self.thresh_ = threshold return scores
def _histogram(self, decision, nbins): """Generate histograms and get bin centers.""" counts, bin_edges = np.histogram(decision, bins=nbins, range=(0, 1)) bin_centers = (bin_edges[:-1] + bin_edges[1:])/2. return bin_centers, counts def _find_local_maxima_idx(self, hist): """Find the local maxima in histogram.""" maximum_idxs = [] direction = 1 for i in range(hist.shape[0] - 1): if direction > 0: if hist[i + 1] < hist[i]: direction = -1 maximum_idxs.append(i) elif hist[i + 1] > hist[i]: direction = 1 return maximum_idxs def _OTSU_thres(self, bin_centers, counts): """Otsu's method for histogram based thresholding.""" counts = counts.astype(float) # class probabilities for all possible thresholds weight1 = np.cumsum(counts) weight2 = np.cumsum(counts[::-1])[::-1] # class means for all possible thresholds mean1 = np.cumsum(counts * bin_centers) / weight1 mean2 = (np.cumsum((counts * bin_centers)[::-1]) / weight2[::-1])[::-1] # Clip ends to align class 1 and class 2 variables: variance12 = weight1[:-1] * weight2[1:] * (mean1[:-1] - mean2[1:])**2 idx = np.argmax(variance12) return bin_centers[idx] def _YEN_thres(self, bin_centers, counts): """Yen's method for histogram based thresholding.""" # Calculate probability mass function pmf = counts.astype(np.float32) / counts.sum() P1 = np.cumsum(pmf) P1_sq = np.cumsum(pmf ** 2) # Get cumsum calculated from end of squared array: P2_sq = np.cumsum(pmf[::-1] ** 2)[::-1] # Get critical value crit = np.log(((P1_sq[:-1] * P2_sq[1:]) ** -1) * (P1[:-1] * (1.0 - P1[:-1])) ** 2) return bin_centers[crit.argmax()] def _ISODATA_thres(self, bin_centers, counts): """ISODATA method for histogram based thresholding.""" counts = counts.astype(np.float32) # csuml and csumh contain the count of pixels in that bin or lower, and # in all bins strictly higher than that bin, respectively csuml = np.cumsum(counts) csumh = csuml[-1] - csuml # intensity_sum contains the total score intensity from each bin intensity_sum = counts * bin_centers # Get the lower and higher average value of all scores in that bin or lower, and # in all bins strictly higher than that bin, respectively. csum_intensity = np.cumsum(intensity_sum) lower = csum_intensity[:-1] / csuml[:-1] higher = (csum_intensity[-1] - csum_intensity[:-1]) / csumh[:-1] # Find threshold values that meet the criterion t = (l + m)/2 all_mean = (lower + higher) / 2.0 bin_width = bin_centers[1] - bin_centers[0] # Look only at thresholds that are below the actual all_mean value distances = all_mean - bin_centers[:-1] return bin_centers[:-1][(distances >= 0) & (distances < bin_width)][0] def _LI_thres(self, decision, bin_centers, counts): """Li's iterative Minimum Cross Entropy method for histogram. based thresholing """ counts = counts.astype(float) tolerance = np.min(np.diff(np.unique(decision)))/2 t_next = np.mean(decision) # initial new guess for iteration t_curr = -2 * tolerance # initial old guess for iteration # Iterate until the new and old thresholds difference # is less than the tolerance while abs(t_next - t_curr) > tolerance: t_curr = t_next outlier = bin_centers > t_curr inlier = ~outlier mean_out = np.average(bin_centers[outlier], weights=counts[outlier]) mean_in = np.average(bin_centers[inlier], weights=counts[inlier]) t_next = ((mean_in - mean_out) / (np.log(mean_in) - np.log(mean_out))) return t_next def _Minimum_thres(self, bin_centers, counts): """Minimum method for histogram based thresholding.""" smooth_hist = counts.astype(np.float64, copy=False) for _ in range(10000): smooth_hist = ndi.uniform_filter1d(smooth_hist, 3) maximum_idxs = self._find_local_maxima_idx(smooth_hist) if len(maximum_idxs) < 3: break # Find lowest point between the maxima threshold_idx = np.argmin( smooth_hist[maximum_idxs[0]:maximum_idxs[1] + 1]) return bin_centers[maximum_idxs[0] + threshold_idx] def _Triangle_thres(self, bin_centers, counts): """Triangle algorithm for histogram based thresholding.""" nbins = len(counts) # Find peak, lowest and highest score levels. arg_peak_height = np.argmax(counts) peak_height = counts[arg_peak_height] arg_low_level, arg_high_level = np.where(counts > 0)[0][[0, -1]] # Flip is True if left tail is shorter. flip = arg_peak_height - arg_low_level < arg_high_level - arg_peak_height if flip: counts = counts[::-1] arg_low_level = nbins - arg_high_level - 1 arg_peak_height = nbins - arg_peak_height - 1 # Set up the coordinate system. width = arg_peak_height - arg_low_level x1 = np.arange(width) y1 = counts[x1 + arg_low_level] # Normalize. norm = np.sqrt(peak_height**2 + width**2) peak_height /= norm width /= norm # Maximize the length. length = peak_height * x1 - width * y1 arg_level = np.argmax(length) + arg_low_level arg_level = nbins - arg_level - 1 if flip else arg_level return bin_centers[arg_level]