"""
This function is adapted from [pyod] by [yzhao062]
Original source: [https://github.com/yzhao062/pyod]
"""
from __future__ import division
from __future__ import print_function
import warnings
import numpy as np
from joblib import Parallel, delayed
from scipy.stats import skew as skew_sp
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array
from .base import BaseDetector
from ..utils.stat_models import column_ecdf
from ..utils.utility import _partition_estimators
from ..utils.utility import zscore
def skew(X, axis=0):
    """Return the sample skewness, with NaNs (e.g. from constant columns)
    replaced by zeros."""
    return np.nan_to_num(skew_sp(X, axis=axis))
def _parallel_ecdf(n_dims, X):
    """Private function to compute per-column ECDFs in parallel.

    Parameters
    ----------
    n_dims : int
        The number of dimensions of the current input matrix.

    X : numpy array
        The subarray for building the ECDF.

    Returns
    -------
    U_l_mat : numpy array
        ECDF subarray for the left tail.

    U_r_mat : numpy array
        ECDF subarray for the right tail (computed on the negated data).
    """
    U_l_mat = np.zeros([X.shape[0], n_dims])
    U_r_mat = np.zeros([X.shape[0], n_dims])

    for i in range(n_dims):
        U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
        U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
    return U_l_mat, U_r_mat
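# For reference, a minimal sketch of the per-column empirical CDF that
# ``column_ecdf`` is assumed to compute: each value is mapped to rank / n,
# so outputs lie in (0, 1] and the negative log transform applied later is
# well defined. The real helper in ``..utils.stat_models`` may differ in
# tie handling and in speed; ``_ecdf_reference`` is illustrative only.
def _ecdf_reference(X):
    # double argsort yields 0-based ranks along each column; +1 makes them
    # 1..n, so the smallest value maps to 1/n and the largest to 1
    n = X.shape[0]
    ranks = np.argsort(np.argsort(X, axis=0), axis=0) + 1
    return ranks / n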
class COPOD(BaseDetector):
    """COPOD class for Copula-Based Outlier Detection.

    COPOD is a parameter-free, highly interpretable outlier detection
    algorithm based on empirical copula models.
    See :cite:`li2020copod` for details.
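
    In brief, for each feature COPOD estimates left- and right-tail
    empirical CDFs, converts them to negative log tail probabilities,
    applies a per-dimension skewness correction to select the relevant
    tail, and sums across dimensions to obtain the outlier score (see
    ``decision_function``).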

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the
        number of cores.

    normalize : bool, optional (default=True)
        If True, z-score normalize each sample before scoring.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : numpy array of shape (n_samples,)
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """
    def __init__(self, contamination=0.1, n_jobs=1, normalize=True):
        super(COPOD, self).__init__(contamination=contamination)
        self.n_jobs = n_jobs
        self.normalize = normalize
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = check_array(X)
        if self.normalize:
            # z-score normalize each sample (row-wise) before scoring
            X = zscore(X, axis=1, ddof=1)
        self._set_n_classes(y)
        # X_train is intentionally set after scoring: decision_function
        # only concatenates training data when ``X_train`` already exists
        self.decision_scores_ = self.decision_function(X)
        self.X_train = X
        self._process_decision_scores()
        return self
    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        For consistency, outliers are assigned with larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # use multi-process execution when requested
        if self.n_jobs != 1:
            return self._decision_function_parallel(X)

        if hasattr(self, 'X_train'):
            # at prediction time, append X to the training data so the
            # ECDFs are estimated on the combined sample
            original_size = X.shape[0]
            X = np.concatenate((self.X_train, X), axis=0)
        # negative log of the left- and right-tail empirical CDFs
        self.U_l = -1 * np.log(column_ecdf(X))
        self.U_r = -1 * np.log(column_ecdf(-X))

        # select the tail indicated by each dimension's skewness sign
        skewness = np.sign(skew(X, axis=0))
        self.U_skew = self.U_l * -1 * np.sign(
            skewness - 1) + self.U_r * np.sign(skewness + 1)
        # per-dimension score: the larger of the skewness-corrected value
        # and the two-tail average
        self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
        if hasattr(self, 'X_train'):
            decision_scores_ = self.O.sum(axis=1)[-original_size:]
        else:
            decision_scores_ = self.O.sum(axis=1)
        return decision_scores_.ravel()
    def _decision_function_parallel(self, X):
        """Predict raw anomaly score of X using the fitted detector,
        computing the per-column ECDFs in parallel.

        For consistency, outliers are assigned with larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        if hasattr(self, 'X_train'):
            original_size = X.shape[0]
            X = np.concatenate((self.X_train, X), axis=0)

        n_samples, n_features = X.shape[0], X.shape[1]

        if n_features < 2:
            raise ValueError(
                'n_jobs should not be used on a one-dimensional dataset')

        if n_features <= self.n_jobs:
            self.n_jobs = n_features
            warnings.warn("n_features <= n_jobs; setting them equal instead.")

        # split the feature columns into roughly equal chunks per worker
        n_jobs, n_dims_list, starts = _partition_estimators(n_features,
                                                            self.n_jobs)

        all_results = Parallel(n_jobs=n_jobs, max_nbytes=None,
                               verbose=True)(
            delayed(_parallel_ecdf)(
                n_dims_list[i],
                X[:, starts[i]:starts[i + 1]],
            )
            for i in range(n_jobs))

        # reassemble the per-chunk ECDFs into full matrices
        self.U_l = np.zeros([n_samples, n_features])
        self.U_r = np.zeros([n_samples, n_features])

        for i in range(n_jobs):
            self.U_l[:, starts[i]:starts[i + 1]] = all_results[i][0]
            self.U_r[:, starts[i]:starts[i + 1]] = all_results[i][1]

        self.U_l = -1 * np.log(self.U_l)
        self.U_r = -1 * np.log(self.U_r)

        # same skewness-corrected scoring as in ``decision_function``
        skewness = np.sign(skew(X, axis=0))
        self.U_skew = self.U_l * -1 * np.sign(
            skewness - 1) + self.U_r * np.sign(skewness + 1)
        self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
        if hasattr(self, 'X_train'):
            decision_scores_ = self.O.sum(axis=1)[-original_size:]
        else:
            decision_scores_ = self.O.sum(axis=1)
        return decision_scores_.ravel()
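
# Example usage (a minimal sketch; the import path below is assumed from this
# file's location, and the relative imports above mean the module must be
# used from within its package rather than run directly; ``predict`` and
# ``decision_scores_`` assume the adapted BaseDetector keeps pyod's API):
#
#     import numpy as np
#     from Time_RCD.models.COPOD import COPOD
#
#     rng = np.random.default_rng(42)
#     X_train = rng.standard_normal((200, 5))               # mostly inliers
#     X_test = np.vstack([rng.standard_normal((8, 5)),
#                         rng.standard_normal((2, 5)) + 6.0])  # 2 outliers
#
#     clf = COPOD(contamination=0.1)
#     clf.fit(X_train)
#     train_scores = clf.decision_scores_   # scores on the training data
#     test_labels = clf.predict(X_test)     # 0 = inlier, 1 = outlier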