import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
from torch.utils.data import Dataset, DataLoader
from config import CONFIG

# Import sklearn untuk encoding label sentimen
try:
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    print("⚠️ scikit-learn tidak terinstall, mencoba install...")
    import subprocess
    subprocess.check_call(['pip', 'install', 'scikit-learn'])
    from sklearn.preprocessing import LabelEncoder


class ABSADataset(Dataset):
    """
    Custom Dataset untuk ABSA batch processing.
    Menggabungkan setiap kalimat dengan setiap aspek untuk prediksi.
    """

    def __init__(self, sentences, aspects, tokenizer, max_len):
        """
        Args:
            sentences (list): List dari kalimat input.
            aspects (list): List dari aspek yang ingin diprediksi.
            tokenizer (AutoTokenizer): Tokenizer IndoBERT.
            max_len (int): Panjang maksimum token.
        """
        self.sentences = sentences
        self.aspects = aspects
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # Total kombinasi = jumlah kalimat × jumlah aspek
        return len(self.sentences) * len(self.aspects)

    def __getitem__(self, idx):
        """
        Mengembalikan encoded input untuk satu pasangan kalimat-aspek.
        """
        # Hitung indeks kalimat dan aspek dari indeks global
        sent_idx = idx // len(self.aspects)
        aspect_idx = idx % len(self.aspects)

        sentence = self.sentences[sent_idx]
        aspect = self.aspects[aspect_idx]

        # Gabungkan aspek dan kalimat dengan format khusus
        combined = f"[ASPEK] {aspect} [TEXT] {sentence}"

        # Tokenisasi dan encoding text
        encoded = self.tokenizer.encode_plus(
            combined,
            add_special_tokens=True,  # Tambah token [CLS] dan [SEP]
            padding="max_length",     # Padding ke max_length
            max_length=self.max_len,
            truncation=True,          # Potong jika melebihi max_length
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'sent_idx': sent_idx,      # Simpan indeks untuk mapping hasil
            'aspect_idx': aspect_idx
        }


def load_model_and_tokenizer():
    """
    Memuat model IndoBERT ABSA, tokenizer, dan label encoder.
    Jika file tidak ada lokal, akan download dari HuggingFace.

    Returns:
        model (nn.Module): Model ABSA yang sudah diload.
        tokenizer (AutoTokenizer): Tokenizer untuk IndoBERT.
        label_encoder (LabelEncoder): Encoder untuk label sentimen.
        device (torch.device): Device (cuda/cpu) yang digunakan.
    """
    # Setup path direktori model dan tokenizer
    base_path = os.path.abspath(os.path.dirname(__file__))
    model_dir = os.path.join(base_path, "assets", "model")
    tokenizer_dir = os.path.join(base_path, "assets", "tokenizer")

    # Buat direktori jika belum ada
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(tokenizer_dir, exist_ok=True)

    model_path = os.path.join(model_dir, "indobert_absa_model.pth")
    label_path = os.path.join(model_dir, "label_encoder.joblib")

    # === DOWNLOAD MODEL JIKA BELUM ADA ===
    if not os.path.exists(model_path):
        print("📥 Downloading model dari HuggingFace...")
        try:
            # Download dari HuggingFace Hub
            downloaded_model = hf_hub_download(
                repo_id=CONFIG["hf_model_repo"],
                filename="indobert_absa_model.pth",
                subfolder=CONFIG["hf_model_subfolder"],
                cache_dir=None
            )
            # Copy ke direktori lokal
            import shutil
            shutil.copy(downloaded_model, model_path)
            print(f"✅ Model berhasil didownload ke {model_path}")
        except Exception as e:
            print(f"❌ Error downloading model: {e}")
            raise

    # === DOWNLOAD LABEL ENCODER JIKA BELUM ADA ===
    if not os.path.exists(label_path):
        print("📥 Downloading label encoder dari HuggingFace...")
        try:
            downloaded_label = hf_hub_download(
                repo_id=CONFIG["hf_model_repo"],
                filename="label_encoder.joblib",
                subfolder=CONFIG["hf_model_subfolder"],
                cache_dir=None
            )
            import shutil
            shutil.copy(downloaded_label, label_path)
            print(f"✅ Label encoder berhasil didownload ke {label_path}")
        except Exception as e:
            print(f"❌ Error downloading label encoder: {e}")
            raise

    # === DOWNLOAD TOKENIZER JIKA BELUM ADA ===
    tokenizer_files = ["special_tokens_map.json", "tokenizer.json",
                       "tokenizer_config.json", "vocab.txt"]

    # Cek apakah semua file tokenizer sudah ada
    all_tokenizer_exists = all(
        os.path.exists(os.path.join(tokenizer_dir, f)) for f in tokenizer_files
    )

    if not all_tokenizer_exists:
        print("📥 Downloading tokenizer dari HuggingFace...")
        try:
            for file in tokenizer_files:
                if not os.path.exists(os.path.join(tokenizer_dir, file)):
                    # Download setiap file tokenizer
                    downloaded_file = hf_hub_download(
                        repo_id=CONFIG["hf_model_repo"],
                        filename=file,
                        subfolder=CONFIG["hf_tokenizer_subfolder"],
                        cache_dir=None
                    )
                    import shutil
                    shutil.copy(downloaded_file, os.path.join(
                        tokenizer_dir, file))
            print(f"✅ Tokenizer berhasil didownload ke {tokenizer_dir}")
        except Exception as e:
            print(f"❌ Error downloading tokenizer: {e}")
            # Fallback ke pretrained jika gagal
            pass

    # === LOAD TOKENIZER ===
    try:
        # Coba load dari direktori lokal
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        print("✅ Tokenizer loaded dari lokal")
    except Exception as e:
        # Fallback: load dari pretrained model
        print(
            f"⚠️ Gagal load tokenizer lokal ({e}), menggunakan pretrained...")
        tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

    # === LOAD LABEL ENCODER ===
    try:
        label_encoder = joblib.load(label_path)
        print("✅ Label encoder loaded successfully")
    except Exception as e:
        print(f"❌ Error loading label encoder: {e}")
        raise RuntimeError(
            f"Gagal load label_encoder.joblib. "
            f"Pastikan file valid dan scikit-learn terinstall. Error: {e}"
        )

    # === DEFINISI MODEL ARCHITECTURE ===
    class IndoBertForABSA(nn.Module):
        """
        Model klasifikasi aspek berbasis IndoBERT untuk ABSA.
        Arsitektur: IndoBERT -> LayerNorm -> Dropout -> Linear Classifier
        """

        def __init__(self, num_labels):
            super().__init__()
            # Load pretrained IndoBERT
            self.bert = AutoModel.from_pretrained(
                CONFIG["model_name"], trust_remote_code=True, use_safetensors=True
            )
            # Layer normalisasi untuk stabilitas training
            self.norm = nn.LayerNorm(self.bert.config.hidden_size)
            # Dropout untuk mencegah overfitting
            self.dropout = nn.Dropout(CONFIG["dropout_rate"])
            # Linear layer untuk klasifikasi sentimen
            self.classifier = nn.Linear(
                self.bert.config.hidden_size, num_labels)

        def forward(self, input_ids, attention_mask):
            """
            Forward pass untuk model ABSA.

            Args:
                input_ids (torch.Tensor): Tensor input token IDs.
                attention_mask (torch.Tensor): Tensor mask perhatian.

            Returns:
                torch.Tensor: Logit prediksi.
            """
            # Dapatkan output dari BERT
            output = self.bert(input_ids=input_ids,
                               attention_mask=attention_mask)
            # Gunakan pooler output (representasi [CLS] token)
            pooled = output.pooler_output
            # Normalisasi
            normed = self.norm(pooled)
            # Dropout
            dropped = self.dropout(normed)
            # Klasifikasi
            return self.classifier(dropped)

    # === SETUP DEVICE DAN LOAD MODEL ===
    # Gunakan GPU jika tersedia, jika tidak gunakan CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Inisialisasi model dengan jumlah label dari label encoder
    model = IndoBertForABSA(num_labels=len(label_encoder.classes_))

    try:
        # Load weights model yang sudah di-training
        model.load_state_dict(torch.load(model_path, map_location=device))
        print("✅ Model state dict loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model state dict: {e}")
        raise

    # Pindahkan model ke device (GPU/CPU)
    model.to(device)
    # Set model ke mode evaluasi (nonaktifkan dropout, dll)
    model.eval()

    return model, tokenizer, label_encoder, device


def predict_multi_aspect(model, tokenizer, sentence, aspek_list, label_encoder, device, max_len):
    """
    Melakukan prediksi sentimen untuk setiap aspek pada satu kalimat.
    Proses prediksi dilakukan satu per satu (non-batch).

    Args:
        model (nn.Module): Model ABSA yang sudah diload.
        tokenizer (AutoTokenizer): Tokenizer IndoBERT.
        sentence (str): Kalimat input.
        aspek_list (list): Daftar aspek yang ingin diprediksi.
        label_encoder (LabelEncoder): Encoder label.
        device (torch.device): Device (cuda/cpu).
        max_len (int): Panjang maksimum token.

    Returns:
        dict: Hasil prediksi berupa {aspek: label_sentimen}.
    """
    results = {}

    # Loop untuk setiap aspek
    for aspek in aspek_list:
        # Gabungkan aspek dan kalimat
        combined = f"[ASPEK] {aspek} [TEXT] {sentence}"

        # Tokenisasi input
        encoded = tokenizer.encode_plus(
            combined,
            add_special_tokens=True,
            padding="max_length",
            max_length=max_len,
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Pindahkan tensor ke device
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        # Prediksi tanpa menghitung gradient (inference mode)
        with torch.no_grad():
            # Forward pass
            outputs = model(input_ids, attention_mask)
            # Konversi logits ke probabilitas dengan softmax
            probs = F.softmax(outputs, dim=1).squeeze()
            # Ambil indeks dengan probabilitas tertinggi
            idx = torch.argmax(probs).item()
            # Konversi indeks ke label sentimen
            label = label_encoder.inverse_transform([idx])[0]
            # Simpan hasil
            results[aspek] = label

    return results


def predict_multi_aspect_batch(model, tokenizer, sentences, aspek_list, label_encoder, device, max_len, batch_size=None):
    """
    Melakukan prediksi sentimen untuk setiap aspek pada multiple kalimat menggunakan batch processing.
    Lebih efisien untuk memproses banyak kalimat sekaligus.

    Args:
        model (nn.Module): Model ABSA yang sudah diload.
        tokenizer (AutoTokenizer): Tokenizer IndoBERT.
        sentences (list): List kalimat input.
        aspek_list (list): Daftar aspek yang ingin diprediksi.
        label_encoder (LabelEncoder): Encoder label.
        device (torch.device): Device (cuda/cpu).
        max_len (int): Panjang maksimum token.
        batch_size (int, optional): Ukuran batch. Jika None, gunakan dari CONFIG.

    Returns:
        list: List of dict hasil prediksi [{aspek: label_sentimen}, ...].
    """
    # Set batch size dari CONFIG jika tidak diberikan
    if batch_size is None:
        batch_size = CONFIG.get("batch_size", 32)

    # === BUAT DATASET DAN DATALOADER ===
    # Dataset akan membuat kombinasi semua kalimat × semua aspek
    dataset = ABSADataset(sentences, aspek_list, tokenizer, max_len)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,  # Process dalam batch untuk efisiensi
        shuffle=False,          # Jangan shuffle untuk maintain urutan
        num_workers=CONFIG.get("num_workers", 0)
    )

    # === INISIALISASI CONTAINER HASIL ===
    num_sentences = len(sentences)
    num_aspects = len(aspek_list)
    # Buat matrix untuk menyimpan prediksi [num_sentences x num_aspects]
    all_predictions = [[None] * num_aspects for _ in range(num_sentences)]

    # === BATCH PREDICTION ===
    model.eval()  # Set model ke evaluation mode
    with torch.no_grad():  # Nonaktifkan gradient calculation
        for batch in dataloader:
            # Pindahkan batch ke device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sent_indices = batch['sent_idx'].numpy()
            aspect_indices = batch['aspect_idx'].numpy()

            # Forward pass untuk seluruh batch
            outputs = model(input_ids, attention_mask)
            # Konversi logits ke probabilitas
            probs = F.softmax(outputs, dim=1)
            # Ambil indeks prediksi tertinggi
            pred_indices = torch.argmax(probs, dim=1).cpu().numpy()

            # Konversi indeks ke label sentimen
            labels = label_encoder.inverse_transform(pred_indices)

            # Simpan hasil ke matrix sesuai indeks aslinya
            for i, (sent_idx, aspect_idx, label) in enumerate(zip(sent_indices, aspect_indices, labels)):
                all_predictions[sent_idx][aspect_idx] = label

    # === KONVERSI KE FORMAT DICTIONARY ===
    results = []
    for predictions in all_predictions:
        # Buat dict {aspek: label} untuk setiap kalimat
        result_dict = {aspek: label for aspek,
                       label in zip(aspek_list, predictions)}
        results.append(result_dict)

    return results