import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
from torch.utils.data import Dataset, DataLoader
from config import CONFIG

# Import sklearn for encoding the sentiment labels
try:
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    print("⚠️ scikit-learn is not installed, attempting to install it...")
    import subprocess
    import sys
    # Use the current interpreter's pip so the package lands in the right environment
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
    from sklearn.preprocessing import LabelEncoder
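
# The CONFIG dict imported above is expected to provide at least the keys this
# module references: "hf_model_repo", "hf_model_subfolder", "hf_tokenizer_subfolder",
# "model_name", and "dropout_rate" (their actual values live in config.py, which is
# not shown here).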


# Custom Dataset for batch processing
class ABSADataset(Dataset):
    """
    Custom Dataset for ABSA batch processing.
    Pairs every sentence with every aspect for prediction.
    """

    def __init__(self, sentences, aspects, tokenizer, max_len):
        """
        Args:
            sentences (list): List of input sentences.
            aspects (list): List of aspects to predict.
            tokenizer (AutoTokenizer): IndoBERT tokenizer.
            max_len (int): Maximum token length.
        """
        self.sentences = sentences
        self.aspects = aspects
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # Total combinations = number of sentences × number of aspects
        return len(self.sentences) * len(self.aspects)

    def __getitem__(self, idx):
        """
        Returns the encoded input for one sentence-aspect pair.
        """
        # Derive the sentence and aspect indices from the global index
        sent_idx = idx // len(self.aspects)
        aspect_idx = idx % len(self.aspects)
        sentence = self.sentences[sent_idx]
        aspect = self.aspects[aspect_idx]

        # Combine the aspect and the sentence using the dedicated input format
        combined = f"[ASPEK] {aspect} [TEXT] {sentence}"

        # Tokenize and encode the text
        encoded = self.tokenizer.encode_plus(
            combined,
            add_special_tokens=True,   # Add the [CLS] and [SEP] tokens
            padding="max_length",      # Pad to max_length
            max_length=self.max_len,
            truncation=True,           # Truncate if longer than max_length
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'sent_idx': sent_idx,      # Keep the indices for mapping results back
            'aspect_idx': aspect_idx
        }
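

# Illustrative sketch (not part of the original module): one way to wrap
# ABSADataset in a DataLoader for batched inference. The helper name and the
# batch_size default are assumptions made for this example.
def make_absa_dataloader(sentences, aspects, tokenizer, max_len, batch_size=16):
    """Build a DataLoader that yields batched sentence-aspect encodings."""
    dataset = ABSADataset(sentences, aspects, tokenizer, max_len)
    # shuffle=False keeps the batches in input order; sent_idx / aspect_idx in each
    # batch identify which sentence-aspect pair every prediction belongs to.
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)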


def load_model_and_tokenizer():
    """
    Loads the IndoBERT ABSA model, tokenizer, and label encoder.
    If the files are not present locally, they are downloaded from HuggingFace.

    Returns:
        model (nn.Module): The loaded ABSA model.
        tokenizer (AutoTokenizer): Tokenizer for IndoBERT.
        label_encoder (LabelEncoder): Encoder for the sentiment labels.
        device (torch.device): Device (cuda/cpu) in use.
    """
    # Set up the model and tokenizer directory paths
    base_path = os.path.abspath(os.path.dirname(__file__))
    model_dir = os.path.join(base_path, "assets", "model")
    tokenizer_dir = os.path.join(base_path, "assets", "tokenizer")

    # Create the directories if they do not exist yet
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(tokenizer_dir, exist_ok=True)

    model_path = os.path.join(model_dir, "indobert_absa_model.pth")
    label_path = os.path.join(model_dir, "label_encoder.joblib")

    # === DOWNLOAD THE MODEL IF IT IS NOT PRESENT ===
    if not os.path.exists(model_path):
        print("📥 Downloading the model from HuggingFace...")
        try:
            # Download from the HuggingFace Hub
            downloaded_model = hf_hub_download(
                repo_id=CONFIG["hf_model_repo"],
                filename="indobert_absa_model.pth",
                subfolder=CONFIG["hf_model_subfolder"],
                cache_dir=None
            )
            # Copy it into the local directory
            import shutil
            shutil.copy(downloaded_model, model_path)
            print(f"✅ Model downloaded to {model_path}")
        except Exception as e:
            print(f"❌ Error downloading model: {e}")
            raise

    # === DOWNLOAD THE LABEL ENCODER IF IT IS NOT PRESENT ===
    if not os.path.exists(label_path):
        print("📥 Downloading the label encoder from HuggingFace...")
        try:
            downloaded_label = hf_hub_download(
                repo_id=CONFIG["hf_model_repo"],
                filename="label_encoder.joblib",
                subfolder=CONFIG["hf_model_subfolder"],
                cache_dir=None
            )
            import shutil
            shutil.copy(downloaded_label, label_path)
            print(f"✅ Label encoder downloaded to {label_path}")
        except Exception as e:
            print(f"❌ Error downloading label encoder: {e}")
            raise

    # === DOWNLOAD THE TOKENIZER IF IT IS NOT PRESENT ===
    tokenizer_files = ["special_tokens_map.json", "tokenizer.json",
                       "tokenizer_config.json", "vocab.txt"]
    # Check whether all tokenizer files already exist
    all_tokenizer_exists = all(
        os.path.exists(os.path.join(tokenizer_dir, f)) for f in tokenizer_files
    )
    if not all_tokenizer_exists:
        print("📥 Downloading the tokenizer from HuggingFace...")
        try:
            for file in tokenizer_files:
                if not os.path.exists(os.path.join(tokenizer_dir, file)):
                    # Download each tokenizer file
                    downloaded_file = hf_hub_download(
                        repo_id=CONFIG["hf_model_repo"],
                        filename=file,
                        subfolder=CONFIG["hf_tokenizer_subfolder"],
                        cache_dir=None
                    )
                    import shutil
                    shutil.copy(downloaded_file, os.path.join(tokenizer_dir, file))
            print(f"✅ Tokenizer downloaded to {tokenizer_dir}")
        except Exception as e:
            print(f"❌ Error downloading tokenizer: {e}")
            # Fall back to the pretrained tokenizer if the download fails
            pass

    # === LOAD THE TOKENIZER ===
    try:
        # Try loading from the local directory first
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        print("✅ Tokenizer loaded from local files")
    except Exception as e:
        # Fallback: load the pretrained tokenizer
        print(f"⚠️ Failed to load the local tokenizer ({e}), using the pretrained one...")
        tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

    # === LOAD THE LABEL ENCODER ===
    try:
        label_encoder = joblib.load(label_path)
        print("✅ Label encoder loaded successfully")
    except Exception as e:
        print(f"❌ Error loading label encoder: {e}")
        raise RuntimeError(
            f"Failed to load label_encoder.joblib. "
            f"Make sure the file is valid and scikit-learn is installed. Error: {e}"
        )

    # === MODEL ARCHITECTURE DEFINITION ===
    class IndoBertForABSA(nn.Module):
        """
        IndoBERT-based aspect classification model for ABSA.
        Architecture: IndoBERT -> LayerNorm -> Dropout -> Linear classifier
        """

        def __init__(self, num_labels):
            super().__init__()
            # Load the pretrained IndoBERT backbone
            self.bert = AutoModel.from_pretrained(
                CONFIG["model_name"], trust_remote_code=True, use_safetensors=True
            )
            # Layer normalization for training stability
            self.norm = nn.LayerNorm(self.bert.config.hidden_size)
            # Dropout to reduce overfitting
            self.dropout = nn.Dropout(CONFIG["dropout_rate"])
            # Linear layer for sentiment classification
            self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        def forward(self, input_ids, attention_mask):
            """
            Forward pass of the ABSA model.

            Args:
                input_ids (torch.Tensor): Input token ID tensor.
                attention_mask (torch.Tensor): Attention mask tensor.

            Returns:
                torch.Tensor: Prediction logits.
            """
            # Get the BERT outputs
            output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            # Use the pooler output (the [CLS] token representation)
            pooled = output.pooler_output
            # Normalize
            normed = self.norm(pooled)
            # Apply dropout
            dropped = self.dropout(normed)
            # Classify
            return self.classifier(dropped)

    # === SET UP THE DEVICE AND LOAD THE MODEL ===
    # Use the GPU if available, otherwise fall back to the CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize the model with the number of labels from the label encoder
    model = IndoBertForABSA(num_labels=len(label_encoder.classes_))
    try:
        # Load the trained model weights
        model.load_state_dict(torch.load(model_path, map_location=device))
        print("✅ Model state dict loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model state dict: {e}")
        raise

    # Move the model to the device (GPU/CPU)
    model.to(device)
    # Put the model in evaluation mode (disables dropout, etc.)
    model.eval()

    return model, tokenizer, label_encoder, device
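

# Note: the returned label_encoder is a fitted sklearn LabelEncoder, so
# label_encoder.inverse_transform(pred_ids) maps predicted class indices back to
# the original sentiment label strings (as done in predict_multi_aspect below).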


def predict_multi_aspect(model, tokenizer, sentence, aspek_list, label_encoder, device, max_len):
    """
    Predicts the sentiment of each aspect for a single sentence.
    Predictions are made one aspect at a time (non-batched).

    Args:
        model (nn.Module): The loaded ABSA model.
        tokenizer (AutoTokenizer): IndoBERT tokenizer.
        sentence (str): Input sentence.
        aspek_list (list): List of aspects to predict.
        label_encoder (LabelEncoder): Label encoder.
        device (torch.device): Device (cuda/cpu).
        max_len (int): Maximum token length.

    Returns:
        dict: Prediction results as {aspect: sentiment_label}.
    """
    results = {}
    # Loop over every aspect
    for aspek in aspek_list:
        # Combine the aspect and the sentence
        combined = f"[ASPEK] {aspek} [TEXT] {sentence}"
        # Tokenize the input
        encoded = tokenizer.encode_plus(
            combined,
            add_special_tokens=True,
            padding="max_length",
            max_length=max_len,
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        # Move the tensors to the device
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        # Predict without computing gradients (inference mode)
        with torch.no_grad():
            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Take the class with the highest logit
            pred_idx = torch.argmax(outputs, dim=1).item()

        # Map the class index back to its sentiment label
        results[aspek] = label_encoder.inverse_transform([pred_idx])[0]

    return results
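

# Illustrative usage sketch (not part of the original file): running this module
# directly smoke-tests the pipeline. The example review, the aspect names, and
# max_len=128 below are made-up values for demonstration only.
if __name__ == "__main__":
    model, tokenizer, label_encoder, device = load_model_and_tokenizer()
    demo = predict_multi_aspect(
        model, tokenizer,
        "Makanannya enak tapi pelayanannya lambat",  # hypothetical example review
        ["makanan", "pelayanan"],                    # hypothetical aspect names
        label_encoder, device,
        max_len=128,                                 # assumed maximum sequence length
    )
    print(demo)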