"""
This function is adapted from [pyod] by [yzhao062]
Original source: [https://github.com/yzhao062/pyod]
"""

from __future__ import division
from __future__ import print_function
import warnings

import numpy as np

from joblib import Parallel, delayed
from scipy.stats import skew as skew_sp
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array

from .base import BaseDetector
from ..utils.stat_models import column_ecdf
from ..utils.utility import _partition_estimators
from ..utils.utility import zscore

def skew(X, axis=0):
    return np.nan_to_num(skew_sp(X, axis=axis))

def _parallel_ecdf(n_dims, X):
    """Private method to calculate ecdf in parallel.    
    Parameters
    ----------
    n_dims : int
        The number of dimensions of the current input matrix

    X : numpy array
        The subarray for building the ECDF

    Returns
    -------
    U_l_mat : numpy array
        ECDF subarray.

    U_r_mat : numpy array
        ECDF subarray.
    """
    U_l_mat = np.zeros([X.shape[0], n_dims])
    U_r_mat = np.zeros([X.shape[0], n_dims])

    for i in range(n_dims):
        U_l_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1])
        U_r_mat[:, i: i + 1] = column_ecdf(X[:, i: i + 1] * -1)
    return U_l_mat, U_r_mat

class COPOD(BaseDetector):
    """COPOD class for Copula Based Outlier Detector.
    COPOD is a parameter-free, highly interpretable outlier detection algorithm
    based on empirical copula models.
    See :cite:`li2020copod` for details.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
        
    n_jobs : optional (default=1)
        The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the
        number of cores.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.
    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, n_jobs=1, normalize=True):
        super(COPOD, self).__init__(contamination=contamination)

        #TODO: Make it parameterized for n_jobs
        self.n_jobs = n_jobs
        self.normalize = normalize

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.
        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        y : Ignored
            Not used, present for API consistency by convention.
        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = check_array(X)
        if self.normalize: X = zscore(X, axis=1, ddof=1)

        self._set_n_classes(y)
        self.decision_scores_ = self.decision_function(X)
        self.X_train = X
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.
         For consistency, outliers are assigned with larger anomaly scores.
        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.
        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        # use multi-thread execution
        if self.n_jobs != 1:
            return self._decision_function_parallel(X)
        if hasattr(self, 'X_train'):
            original_size = X.shape[0]
            X = np.concatenate((self.X_train, X), axis=0)
        self.U_l = -1 * np.log(column_ecdf(X))
        self.U_r = -1 * np.log(column_ecdf(-X))

        skewness = np.sign(skew(X, axis=0))
        self.U_skew = self.U_l * -1 * np.sign(
            skewness - 1) + self.U_r * np.sign(skewness + 1)
        self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
        if hasattr(self, 'X_train'):
            decision_scores_ = self.O.sum(axis=1)[-original_size:]
        else:
            decision_scores_ = self.O.sum(axis=1)
        return decision_scores_.ravel()

    def _decision_function_parallel(self, X):
        """Predict raw anomaly score of X using the fitted detector.
         For consistency, outliers are assigned with larger anomaly scores.
        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.
        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        if hasattr(self, 'X_train'):
            original_size = X.shape[0]
            X = np.concatenate((self.X_train, X), axis=0)

        n_samples, n_features = X.shape[0], X.shape[1]

        if n_features < 2:
            raise ValueError(
                'n_jobs should not be used on one dimensional dataset')

        if n_features <= self.n_jobs:
            self.n_jobs = n_features
            warnings.warn("n_features <= n_jobs; setting them equal instead.")

        n_jobs, n_dims_list, starts = _partition_estimators(n_features,
                                                            self.n_jobs)

        all_results = Parallel(n_jobs=n_jobs, max_nbytes=None,
                               verbose=True)(
            delayed(_parallel_ecdf)(
                n_dims_list[i],
                X[:, starts[i]:starts[i + 1]],
            )
            for i in range(n_jobs))

        # recover the results
        self.U_l = np.zeros([n_samples, n_features])
        self.U_r = np.zeros([n_samples, n_features])

        for i in range(n_jobs):
            self.U_l[:, starts[i]:starts[i + 1]] = all_results[i][0]
            self.U_r[:, starts[i]:starts[i + 1]] = all_results[i][1]

        self.U_l = -1 * np.log(self.U_l)
        self.U_r = -1 * np.log(self.U_r)

        skewness = np.sign(skew(X, axis=0))
        self.U_skew = self.U_l * -1 * np.sign(
            skewness - 1) + self.U_r * np.sign(skewness + 1)
        self.O = np.maximum(self.U_skew, np.add(self.U_l, self.U_r) / 2)
        if hasattr(self, 'X_train'):
            decision_scores_ = self.O.sum(axis=1)[-original_size:]
        else:
            decision_scores_ = self.O.sum(axis=1)
        return decision_scores_.ravel()