# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1iEIEymiaYLbhMx5NqPutAwwI2rMCyIOQ
"""

import pandas as pd
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import sys

# --- 1. Model and Data Configuration ---
# IndoBERT-Large-P1 (stronger than MiniLM)
MODEL_NAME = "indobenchmark/indobert-large-p1"

try:
    print(f"Loading model: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    print("Model loaded successfully.")
except Exception as e:
    print(f"FATAL ERROR: Failed to load the IndoBERT-Large model. Details: {e}")
    sys.exit(1)

# Mean pooling (required for non-SBERT models, which return per-token embeddings)
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element: last hidden state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # avoid division by zero
    return sum_embeddings / sum_mask

# Sentence-embedding helper
def get_embeddings(texts):
    """Return L2-normalized embedding vectors for the given texts."""
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    return F.normalize(sentence_embeddings, p=2, dim=1).numpy()

# --- CSV Data Loading ---
FILE_PATH = "perpustakaan_faq.csv"
df_faq = None

try:
    print(f"Attempting to load file: {FILE_PATH}...")
    df_faq = pd.read_csv(FILE_PATH)
    df_faq = df_faq.rename(columns={'user_query': 'question', 'chatbot_response': 'answer'})
    print(f"File '{FILE_PATH}' loaded with {len(df_faq)} rows of data.")
except Exception as e:
    print(f"\nFATAL ERROR: Failed to load the CSV data. Make sure the file exists at the path. Details: {e}")
    sys.exit(1)

# --- 2. Pre-computation of FAQ Embeddings ---
faq_embeddings = None
if df_faq is not None and not df_faq.empty:
    print("Computing embeddings for the FAQ data...")
    faq_questions = df_faq['question'].tolist()
    faq_embeddings = get_embeddings(faq_questions)
    print("Embedding computation finished.")
else:
    print("FATAL ERROR: Data not found or empty.")
    sys.exit(1)

# --- 3. Chatbot Logic (Retrieval) ---
# FIX: similarity threshold lowered to 0.60
def library_chatbot(user_query, threshold=0.60):
    """Main chatbot function that responds to a user question."""
    if not user_query:
        return "Halo! Silakan ajukan pertanyaan seputar perpustakaan."

    user_embedding = get_embeddings([user_query])
    similarities = cosine_similarity(user_embedding, faq_embeddings)
    best_match_index = int(np.argmax(similarities))  # positional index of the closest FAQ question
    max_similarity = similarities[0][best_match_index]
    best_response = df_faq.iloc[best_match_index]['answer']

    if max_similarity >= threshold:
        return best_response
    else:
        return (
            "Mohon maaf, saya belum dapat menemukan jawaban yang spesifik untuk pertanyaan Anda "
            "dalam data yang saya miliki saat ini (Skor Kemiripan Tertinggi: {:.4f}). "
            "Silakan ajukan pertanyaan dengan kata kunci yang berbeda.".format(max_similarity)
        )
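# Quick sanity check of the retrieval step before wiring up the UI. This is a
# minimal sketch: the sample query below is a hypothetical example and is not
# assumed to appear in perpustakaan_faq.csv.
sample_query = "Jam berapa perpustakaan buka?"
sample_scores = cosine_similarity(get_embeddings([sample_query]), faq_embeddings)
print(f"Sanity check: best similarity for '{sample_query}' is {sample_scores.max():.4f} "
      f"(answered directly only if >= 0.60).")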
# --- 4. Display Results (Gradio Interface) ---
title = "Perpustakaan BI Kantor Sumatera Selatan"
description = (
    "🤖 Chatbot ini menggunakan model **IndoBERT-Large-P1** yang lebih kuat. "
    "Ambang batas (threshold) kemiripan diturunkan menjadi **0.60**."
)

# Build the Gradio interface
iface = gr.Interface(
    fn=library_chatbot,
    inputs=gr.Textbox(lines=2, placeholder="Ketik pertanyaan Anda di sini...", label="Pertanyaan Pengguna"),
    outputs=gr.Textbox(label="Jawaban Chatbot"),
    title=title,
    description=description,
    theme=gr.themes.Soft(),
    allow_flagging='never'
)

# Launch the Gradio app
print("\n--- Launching Gradio ---")
print("Please wait for the public link to appear (Running on public URL: ...)")
iface.launch(share=True)
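# Note: share=True requests a temporary public *.gradio.live URL, which is what
# makes the app reachable from Colab. For a local-only deployment (an assumption
# about your environment, not part of the original notebook), you could instead
# launch with an explicit host and port:
#
#   iface.launch(server_name="0.0.0.0", server_port=7860)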