|
|
|
|
|
"""app.ipynb |
|
|
|
|
|
Automatically generated by Colab. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/1iEIEymiaYLbhMx5NqPutAwwI2rMCyIOQ |
|
|
""" |
|
|
|
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from transformers import AutoTokenizer, AutoModel |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import numpy as np |
|
|
import os |
|
|
import sys |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face checkpoint identifier shared by the tokenizer and the encoder.
MODEL_NAME = "indobenchmark/indobert-large-p1"

# Load both artefacts up front; the app cannot work without them, so any
# failure is reported and the process exits with a non-zero status.
try:
    print(f"Memuat Model Kuat: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
except Exception as load_error:
    print(f"FATAL ERROR: Gagal memuat model IndoBERT-Large. Detail: {load_error}")
    sys.exit(1)
else:
    print("Model berhasil dimuat.")
|
|
|
|
|
|
|
|
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (assumed shape ``(batch, seq_len, hidden)``
            -- TODO confirm against the encoder in use).
        attention_mask: Mask with one entry per token; positions with value 0
            contribute nothing to the average.

    Returns:
        A tensor with dimension 1 (the token axis) reduced away.
    """
    hidden_states = model_output[0]

    # Broadcast the mask across the embedding axis so it can be multiplied
    # element-wise with the hidden states.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_total = (hidden_states * mask).sum(1)
    # Clamp the token count so a fully masked row divides by a tiny number
    # instead of zero.
    token_counts = mask.sum(1).clamp(min=1e-9)

    return masked_total / token_counts
|
|
|
|
|
|
|
|
def get_embeddings(texts, batch_size=32):
    """Return L2-normalised sentence embeddings for the given texts.

    The texts are tokenised (padded, truncated to 512 tokens), passed through
    the module-level ``model`` without gradient tracking, mean-pooled with
    ``mean_pooling`` and normalised to unit length.

    Inputs are processed in chunks of ``batch_size`` so that embedding a large
    FAQ table does not require a single forward pass over every row at once.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        batch_size: Maximum number of texts per forward pass; must be >= 1.

    Returns:
        A NumPy array with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    # A bare string would otherwise be sliced character by character below.
    texts = [texts] if isinstance(texts, str) else list(texts)

    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    embedded_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            model_output = model(**encoded_input)

        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        embedded_batches.append(F.normalize(sentence_embeddings, p=2, dim=1))

    return torch.cat(embedded_batches, dim=0).numpy()
|
|
|
|
|
|
|
|
|
|
|
# CSV file holding the FAQ pairs; a relative path, so it is looked up from
# the current working directory.
FILE_PATH = "perpustakaan_faq.csv"
df_faq = None

# Load and sanity-check the FAQ table. Every failure (missing file, parse
# error, missing columns) is reported once and terminates the process.
try:
    print(f"Mencoba memuat file: {FILE_PATH}...")
    df_faq = pd.read_csv(FILE_PATH)
    df_faq = df_faq.rename(columns={'user_query': 'question', 'chatbot_response': 'answer'})

    # `rename` silently ignores absent columns, so verify the result here
    # instead of failing later with an unhandled KeyError.
    missing_columns = [col for col in ('question', 'answer') if col not in df_faq.columns]
    if missing_columns:
        raise ValueError(f"Kolom wajib tidak ditemukan: {missing_columns}")

    # Rows without a question cannot be tokenised and rows without an answer
    # cannot be returned to the user. Resetting the index keeps row labels
    # aligned with row positions after the drop.
    rows_before = len(df_faq)
    df_faq = df_faq.dropna(subset=['question', 'answer']).reset_index(drop=True)
    dropped_rows = rows_before - len(df_faq)
    if dropped_rows:
        print(f"Peringatan: {dropped_rows} baris tanpa pertanyaan/jawaban diabaikan.")

    print(f"File '{FILE_PATH}' berhasil dimuat dengan {len(df_faq)} baris data.")
except Exception as e:
    print(f"\nFATAL ERROR: Gagal memuat data CSV. Pastikan file ada di path. Detail: {e}")
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
# Embedding matrix for every FAQ question, computed once at start-up.
faq_embeddings = None

# Without any FAQ rows there is nothing to match against, so stop here.
if df_faq is None or df_faq.empty:
    print("FATAL ERROR: Data tidak ditemukan atau kosong.")
    sys.exit(1)

print("Menghitung embeddings untuk data FAQ...")
faq_questions = df_faq['question'].tolist()
faq_embeddings = get_embeddings(faq_questions)
print("Penghitungan embeddings selesai.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def library_chatbot(user_query, threshold=0.60):
    """Answer a user question with the closest matching FAQ entry.

    The query is embedded with ``get_embeddings`` and compared against the
    precomputed ``faq_embeddings`` using cosine similarity. The answer of the
    best-scoring FAQ row is returned when its score reaches ``threshold``.

    Args:
        user_query: The question typed by the user.
        threshold: Minimum similarity required to return an FAQ answer.

    Returns:
        A greeting when the query is empty or only whitespace, the matched
        FAQ answer when the best score is at least ``threshold``, otherwise
        an apology message that includes the best score.
    """
    # Treat whitespace-only input like an empty query instead of embedding it.
    if not user_query or not user_query.strip():
        return "Halo! Silakan ajukan pertanyaan seputar perpustakaan."

    user_embedding = get_embeddings([user_query])

    # A single query row is compared with every FAQ row, so the result has
    # exactly one row of scores.
    scores = cosine_similarity(user_embedding, faq_embeddings)[0]
    best_match_index = int(np.argmax(scores))
    max_similarity = scores[best_match_index]

    if max_similarity < threshold:
        return (
            "Mohon maaf, saya belum dapat menemukan jawaban yang spesifik untuk pertanyaan Anda "
            "dalam data yang saya miliki saat ini (Skor Kemiripan Tertinggi: {:.4f}). "
            "Silakan ajukan pertanyaan dengan kata kunci yang berbeda.".format(max_similarity)
        )

    # argmax yields a row position, so look the answer up by position rather
    # than by index label.
    return df_faq['answer'].iloc[best_match_index]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Heading and introductory text shown at the top of the Gradio page.
title = "Perpustakaan BI Kantor Sumatera Selatan"
description = (
    # The leading emoji was stored as mis-decoded bytes; restored here.
    # IndoBERT is a BERT-based model, so the architecture name is corrected.
    "🤖 Chatbot ini menggunakan Model **IndoBERT-Large-P1** yang lebih kuat dengan arsitektur BERT. "
    "Ambang batas (Threshold) diturunkan menjadi **0.60**."
)
|
|
|
|
|
|
|
|
# Input and output widgets for the single-turn question/answer form.
_question_box = gr.Textbox(
    lines=2,
    placeholder="Ketik pertanyaan Anda di sini...",
    label="Pertanyaan Pengguna",
)
_answer_box = gr.Textbox(label="Jawaban Chatbot")

# Wire the chatbot function to the form; flagging is turned off.
iface = gr.Interface(
    fn=library_chatbot,
    inputs=_question_box,
    outputs=_answer_box,
    title=title,
    description=description,
    theme=gr.themes.Soft(),
    allow_flagging='never',
)

print("\n--- Menjalankan Gradio ---")
print("Tunggu sebentar hingga link publik muncul (Running on public URL: ...)")

# share=True requests a public URL in addition to the local one.
iface.launch(share=True)
|
|
|
|
|
|