Commit: perbaiki model_utils.py and back app.py, visualization.py
(fix model_utils.py and restore app.py and visualization.py)

Files changed:
- app.py: +197 −288
- model_utils.py: +85 −2
- visualization.py: +86 −254
app.py
CHANGED
@@ -6,8 +6,6 @@ berbasis aspek dari kritik dan saran mahasiswa.
 UPDATED: Dengan Batch + Chunked Processing + Session-based Cache untuk multi-user
 UPDATED: Visualisasi dinamis yang menyesuaikan dengan kolom yang tersedia
 """
-
-# Import library yang diperlukan
 import os
 import time
 import gc
@@ -38,48 +36,41 @@ from visualization import (
 from preprocessing import text_preprocessing_pipeline
 
 # Konfigurasi untuk chunked processing
 CHUNK_SIZE = 2500
 ENABLE_CHUNKED = True
 CACHE_EXPIRY_HOURS = 24
 
-# Membuat direktori cache jika belum ada
 os.makedirs("chache_file", exist_ok=True)
 os.makedirs("chache_file/sessions", exist_ok=True)
 
 # Konfigurasi halaman
 st.set_page_config(
     page_title="ABSA IndoBERT",
     layout="wide",
     page_icon="💬"
 )
 
 # Load custom CSS
 with open(os.path.join("assets", "style.css"), encoding="utf-8") as f:
     st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
 st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css" rel="stylesheet">', unsafe_allow_html=True)
 
 
 def get_session_id():
-    """
-    Generate atau retrieve session ID untuk user - PERSISTENT across refresh
-    Menggunakan query params agar session tetap konsisten saat refresh
-    """
+    """Generate atau retrieve session ID untuk user - PERSISTENT across refresh"""
     query_params = st.query_params
 
-    # Cek jika sudah ada session ID di URL
     if "sid" in query_params:
         sid = query_params["sid"]
         st.session_state.session_id = sid
         return sid
 
-    # Buat session ID baru jika belum ada
     if "session_id" not in st.session_state:
         new_session_id = str(uuid.uuid4())
         st.session_state.session_id = new_session_id
         st.query_params["sid"] = new_session_id
         return new_session_id
 
-    # Gunakan session ID yang sudah ada
     existing_id = st.session_state.session_id
     st.query_params["sid"] = existing_id
     return existing_id
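A minimal standalone sketch of the persistence trick this hunk implements, assuming Streamlit ≥ 1.30 where `st.query_params` is a mutable, dict-like object: the ID survives a browser refresh because it is mirrored into the URL, while `st.session_state` alone is reset on refresh.

```python
# sketch: persist an ID across refreshes by mirroring it into the URL
import uuid

import streamlit as st

if "sid" in st.query_params:             # URL already carries an ID
    sid = st.query_params["sid"]
elif "session_id" in st.session_state:   # state exists but URL lost it
    sid = st.session_state.session_id
    st.query_params["sid"] = sid
else:                                    # first visit: mint a new ID
    sid = str(uuid.uuid4())
    st.query_params["sid"] = sid

st.session_state.session_id = sid
st.write(f"session: {sid}")
```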
@@ -101,10 +92,7 @@ def get_session_chunks_dir():
 
 
 def cleanup_old_sessions():
-    """
-    Hapus session cache yang sudah expired (> 24 jam)
-    Membersihkan cache lama untuk menghemat storage
-    """
+    """Hapus session cache yang sudah expired (> 24 jam)"""
     sessions_dir = Path("chache_file/sessions")
     if not sessions_dir.exists():
         return
@@ -115,7 +103,6 @@ def cleanup_old_sessions():
         mod_time = session_dir.stat().st_mtime
         age_hours = (current_time - mod_time) / 3600
 
-        # Hapus jika lebih dari 24 jam
         if age_hours > CACHE_EXPIRY_HOURS:
             try:
                 shutil.rmtree(session_dir)
@@ -124,24 +111,18 @@ def cleanup_old_sessions():
                 print(f"Error deleting session {session_dir.name}: {e}")
 
 
-# Jalankan cleanup saat aplikasi dimulai
 cleanup_old_sessions()
 
 
 @st.cache_resource(show_spinner=False)
 def get_model_resources():
-    """
-    Memuat model dan tokenizer IndoBERT
-    Menggunakan cache agar model tidak dimuat ulang setiap kali
-    """
+    """Memuat model dan tokenizer IndoBERT."""
     return load_model_and_tokenizer()
 
 
-# Load model dengan spinner
 with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
     model, tokenizer, le, device = get_model_resources()
 
-# Tampilkan notifikasi sukses sementara
 success_placeholder = st.empty()
 success_placeholder.success("Model dan tokenizer berhasil dimuat!")
 time.sleep(1)
@@ -149,7 +130,7 @@ success_placeholder.empty()
 
 
 def convert_df_to_excel(df):
-    """Mengubah DataFrame menjadi file Excel dalam bentuk byte stream
+    """Mengubah DataFrame menjadi file Excel dalam bentuk byte stream."""
     output = BytesIO()
     with pd.ExcelWriter(output, engine="openpyxl") as writer:
         df.to_excel(writer, index=False)
@@ -157,7 +138,7 @@ def convert_df_to_excel(df):
 
 
 def clear_memory():
-    """Clear memory cache
+    """Clear memory cache"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
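The hunk context ends before this function's return statement; below is a self-contained sketch of the full pattern, assuming the elided tail simply returns `output.getvalue()` (which the `st.download_button(..., data=convert_df_to_excel(...))` calls later in this diff imply). The function name here is illustrative, not the app's.

```python
from io import BytesIO

import pandas as pd


def dataframe_to_xlsx_bytes(df: pd.DataFrame) -> bytes:
    """Serialize a DataFrame to .xlsx bytes, suitable for st.download_button."""
    output = BytesIO()
    # the writer flushes and closes the workbook on exiting the context
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False)
    return output.getvalue()  # assumed tail of convert_df_to_excel


print(len(dataframe_to_xlsx_bytes(pd.DataFrame({"a": [1, 2]}))))
```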
@@ -165,20 +146,8 @@ def clear_memory():
 
 def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
     """
-    Memproses satu chunk data dengan batch processing
-
-    STEP 2: Batch Prediction menggunakan model IndoBERT
-    STEP 3: Combine results dan simpan ke file CSV
-
-    Args:
-        chunk_dataframe: Data chunk yang akan diproses
-        chunk_num: Nomor chunk saat ini
-        total_chunk_count: Total jumlah chunk
-        progress_bar: Progress bar Streamlit
-        status_text: Text status Streamlit
-
-    Returns:
-        result_dataframe: DataFrame hasil prediksi untuk chunk ini
+    Memproses satu chunk data dengan batch processing.
+    Progress bar: Preprocessing 0-100%, lalu Predicting 0-100%
     """
     # STEP 1: Preprocessing (0-100%)
     cleaned_text_list = []
@@ -188,7 +157,6 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
         clean_text = text_preprocessing_pipeline(str(raw_text))
         cleaned_text_list.append(clean_text)
 
-        # Update progress bar setiap 50 baris
         if idx % 50 == 0 or idx == total_rows - 1:
             progress = (idx + 1) / total_rows
             progress_bar.progress(progress)
@@ -206,7 +174,6 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
     num_sents = len(cleaned_text_list)
     num_asps = len(ASPEK_COLUMNS)
 
-    # Buat dataset dan dataloader
     ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
                      tokenizer, CONFIG["max_len"])
     dl = DataLoader(
@@ -216,13 +183,11 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
         num_workers=0
     )
 
-    # Matrix untuk menyimpan hasil prediksi
     predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
 
     batch_counter = 0
     total_batch_count = len(dl)
 
-    # Lakukan prediksi batch demi batch
     model.eval()
     with torch.no_grad():
         for batch_data in dl:
@@ -231,18 +196,15 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
             sent_idxs = batch_data['sent_idx'].numpy()
             asp_idxs = batch_data['aspect_idx'].numpy()
 
-            # Forward pass model
             model_outputs = model(inp_ids, attn_mask)
             probabilities = F.softmax(model_outputs, dim=1)
             predicted_indices = torch.argmax(
                 probabilities, dim=1).cpu().numpy()
             pred_labels = le.inverse_transform(predicted_indices)
 
-            # Simpan hasil prediksi ke matrix
             for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
                 predictions_matrix[s_idx][a_idx] = lbl
 
-            # Update progress bar
             batch_counter += 1
             progress = batch_counter / total_batch_count
             progress_bar.progress(progress)
@@ -254,14 +216,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
     for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
         row_dict = data_row.to_dict()
         row_dict["kritik_saran"] = cleaned_text_list[idx]
-        # Tambahkan hasil prediksi untuk setiap aspek
         for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
             row_dict[asp_name] = predictions_matrix[idx][asp_idx]
         result_list.append(row_dict)
 
     result_dataframe = pd.DataFrame(result_list)
 
-    # Simpan chunk ke file CSV
     chunks_directory = get_session_chunks_dir()
     chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
     result_dataframe.to_csv(chunk_filepath, index=False)
@@ -270,17 +230,13 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
     progress_bar.progress(1.0)
     status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
 
-    # Bersihkan memory
     clear_memory()
 
     return result_dataframe
 
 
 def get_available_columns(df):
-    """
-    Deteksi kolom-kolom yang tersedia dalam dataframe
-    Untuk menentukan visualisasi mana yang bisa ditampilkan
-    """
+    """Deteksi kolom-kolom yang tersedia dalam dataframe"""
     available = {
         'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
         'has_semester': 'semester' in df.columns,
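The `sent_idx`/`aspect_idx` bookkeeping above is easier to see without the model: N sentences × A aspects are flattened into N·A classification items, and each batched prediction is routed back into its (sentence, aspect) cell. A model-free sketch of that round trip, with `classify` standing in for the IndoBERT forward pass:

```python
# sketch: flatten sentence×aspect pairs, then reassemble predictions
sentences = ["dosen jelas", "materi sulit"]
aspects = ["dosen", "materi", "fasilitas"]

# flatten: one item per (sentence, aspect) pair, with its coordinates
items = [(s_idx, a_idx) for s_idx in range(len(sentences))
         for a_idx in range(len(aspects))]


def classify(s_idx, a_idx):          # stand-in for the model forward pass
    return "positif" if s_idx == 0 else "netral"


# reassemble: route each flat prediction back into the N×A matrix
matrix = [[None] * len(aspects) for _ in range(len(sentences))]
for s_idx, a_idx in items:           # in app.py this loop runs per batch
    matrix[s_idx][a_idx] = classify(s_idx, a_idx)

print(matrix)  # [['positif', 'positif', 'positif'], ['netral', 'netral', 'netral']]
```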
@@ -290,8 +246,6 @@ def get_available_columns(df):
     return available
 
 
-# ================== BAGIAN UI UTAMA ==================
-
 # Judul aplikasi
 st.markdown("""
 <h1 class='title-center'>ABSA IndoBERT</h1>
@@ -303,7 +257,7 @@ st.markdown(" ")
 st.markdown(" ")
 st.markdown(" ")
 
-# Panduan
+# Panduan pengunaan
 steps = [
     {"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
      "description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
@@ -315,7 +269,6 @@ steps = [
      "description": "Unduh hasil analisis lengkap Anda dalam format file Excel untuk laporan lebih lanjut."}
 ]
 
-# Tampilkan panduan dalam 4 kolom
 cols = st.columns(len(steps))
 
 for i, step in enumerate(steps):
@@ -331,19 +284,18 @@ for i, step in enumerate(steps):
 st.markdown("")
 st.markdown("")
 
 # Upload file
 uploaded_file = st.file_uploader(
     " Upload Data Kritik & Saran",
     type=["xlsx"],
     help="File maksimal 200MB dengan format .xlsx"
 )
 
-#
+# Clear cache buttons - SESSION SPECIFIC
 session_cache_dir = get_session_cache_dir()
 session_result_file = session_cache_dir / "temp_predicted.csv"
 session_chunks_dir = get_session_chunks_dir()
 
-# Tombol hapus cache data utama
 if session_result_file.exists():
     if st.button("Hapus Cache Data"):
         session_result_file.unlink()
@@ -351,7 +303,6 @@ if session_result_file.exists():
         time.sleep(1)
         st.rerun()
 
-# Tombol hapus cache chunks
 if session_chunks_dir.exists():
     chunk_files = list(session_chunks_dir.glob("*.csv"))
     if chunk_files:
@@ -363,7 +314,6 @@ if session_chunks_dir.exists():
             time.sleep(1)
             st.rerun()
 
-# Tampilkan info file yang di-cache
 if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
     if not uploaded_file:
         metadata_file = session_cache_dir / "metadata.txt"
@@ -384,15 +334,13 @@ if session_result_file.exists() or (session_chunks_dir.exists() and list(session
 else:
     st.caption(" ")
 
 
 if "df_predicted" not in st.session_state:
     st.session_state.df_predicted = None
 
-# Load cache jika ada
 if st.session_state.df_predicted is None and session_result_file.exists():
     try:
         df_cached = pd.read_csv(session_result_file)
-        # Konversi kolom tahun ke format yang benar
         if "tahun" in df_cached.columns:
             df_cached["tahun"] = pd.to_numeric(
                 df_cached["tahun"], errors='coerce').astype('Int64')
@@ -402,20 +350,14 @@ if st.session_state.df_predicted is None and session_result_file.exists():
         st.warning(f"Gagal memuat cache: {e}")
 
 
-# ================== PROSES UPLOAD & PREDIKSI ==================
 if uploaded_file:
     file_bytes = uploaded_file.getvalue()
-
-    # Cek apakah file baru atau sama dengan sebelumnya
     if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
         st.session_state.last_uploaded_file = file_bytes
         st.session_state.uploaded_filename = uploaded_file.name
-
         try:
-            # Baca file Excel
             df_uploaded = pd.read_excel(BytesIO(file_bytes))
 
-            # Konversi kolom tahun jika ada
             if "tahun" in df_uploaded.columns:
                 df_uploaded["tahun"] = pd.to_numeric(
                     df_uploaded["tahun"], errors='coerce').astype('Int64')
@@ -423,15 +365,11 @@ if uploaded_file:
         except ValueError as err:
             st.error(f"Gagal membaca file: {err}")
         else:
-            # Validasi kolom wajib
             if "kritik_saran" not in df_uploaded.columns:
                 st.error("Kolom 'kritik_saran' tidak ditemukan.")
             else:
-                # Hapus duplikat berdasarkan kolom kritik_saran
                 df_uploaded = df_uploaded.drop_duplicates(
                     subset=["kritik_saran"])
-
-                # Tambahkan kolom aspek jika belum ada
                 for aspect_col in ASPEK_COLUMNS:
                     if aspect_col not in df_uploaded.columns:
                         df_uploaded[aspect_col] = None
@@ -441,11 +379,9 @@ if uploaded_file:
                 total_rows = len(df_uploaded)
                 use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
 
-                # ============ MODE CHUNKED PROCESSING ============
                 if use_chunked:
                     num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
 
-                    # Tampilkan info processing
                     info_col1, info_col2, info_col3 = st.columns(3)
                     with info_col1:
                         st.info(f"**Total data:** {total_rows:,} rows")
@@ -462,7 +398,6 @@ if uploaded_file:
                     chunk_status_text = st.empty()
                     overall_status = st.empty()
 
-                    # Proses setiap chunk
                     for start_idx in range(0, total_rows, CHUNK_SIZE):
                         current_chunk_number = (start_idx // CHUNK_SIZE) + 1
                         current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
@@ -471,7 +406,6 @@ if uploaded_file:
                         current_chunk_file = session_chunks_dir / \
                             f"chunk_{current_chunk_number}.csv"
 
-                        # Cek apakah chunk sudah pernah diproses (ada di cache)
                         if current_chunk_file.exists():
                             chunk_result = pd.read_csv(current_chunk_file)
                             all_chunk_results.append(chunk_result)
@@ -490,7 +424,6 @@ if uploaded_file:
                             time.sleep(0.3)
                             continue
 
-                        # Proses chunk baru
                        chunk_progress_bar.progress(0)
 
                        chunk_result = process_chunk_batch(
@@ -499,7 +432,6 @@ if uploaded_file:
                        )
                        all_chunk_results.append(chunk_result)
 
-                        # Hitung estimasi waktu
                        processed = min(start_idx + CHUNK_SIZE, total_rows)
                        progress_pct = (processed / total_rows) * 100
                        elapsed = time.time() - start_time
@@ -514,7 +446,6 @@ if uploaded_file:
 
                        time.sleep(0.3)
 
-                    # Gabungkan semua chunk
                    chunk_status_text.empty()
                    overall_status.info("🔄 Menggabungkan semua chunks...")
                    df_session = pd.concat(
@@ -524,7 +455,6 @@ if uploaded_file:
                    end_time = time.time()
                    duration = end_time - start_time
 
-                # ============ MODE BATCH PROCESSING (tanpa chunk) ============
                else:
                    st.info(
                        f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
@@ -534,7 +464,6 @@ if uploaded_file:
                    progress_bar = st.progress(0)
                    status_text = st.empty()
 
-                    # Preprocessing
                    cleaned_text_list = []
                    total_preprocessing = len(df_uploaded)
 
@@ -552,7 +481,6 @@ if uploaded_file:
                    status_text.text("Memulai prediksi...")
                    time.sleep(0.3)
 
-                    # Batch Prediction
                    batch_sz = CONFIG.get("batch_size", 32)
                    num_sents = len(cleaned_text_list)
                    num_asps = len(ASPEK_COLUMNS)
@@ -592,7 +520,6 @@ if uploaded_file:
                    status_text.text(
                        f"Predicting: {batch_counter}/{total_batch_count} batches")
 
-                    # Combine results
                    result_list = []
                    for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
                        row_dict = data_row.to_dict()
@@ -612,20 +539,16 @@ if uploaded_file:
                end_time = time.time()
                duration = end_time - start_time
 
-                # Simpan hasil ke session state dan cache
                st.session_state.df_predicted = df_session
                df_session.to_csv(session_result_file, index=False)
 
-                # Simpan metadata file
                metadata_file = session_cache_dir / "metadata.txt"
                with open(metadata_file, "w", encoding="utf-8") as f:
                    f.write(uploaded_file.name)
 
-                # Hitung statistik processing
                total_items = total_rows * len(ASPEK_COLUMNS)
                items_per_second = total_items / duration if duration > 0 else 0
 
-                # Tampilkan hasil processing
                if use_chunked:
                    st.success(
                        f"✅ **Chunked + Batch Processing selesai!**\n\n"
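The chunk loop above rests on two small formulas: ceiling division for the chunk count and a rate-based ETA. A worked sketch with illustrative numbers:

```python
CHUNK_SIZE = 2500
total_rows = 6_000

# ceiling division: 6000 rows -> chunks of 2500, 2500, 1000
num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
assert num_chunks == 3

# rate-based ETA after some rows are done (numbers are illustrative)
processed, elapsed = 2_500, 50.0           # rows finished, seconds spent
rate = processed / elapsed                 # rows per second
eta = (total_rows - processed) / rate      # seconds remaining
print(f"{num_chunks} chunks, ETA ~{eta:.0f}s")  # 3 chunks, ETA ~70s
```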
@@ -645,14 +568,14 @@ if uploaded_file:
                        f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
                    )
 
-#
+# Setelah prediksi selesai
 if st.session_state.df_predicted is not None:
     df_predicted = st.session_state.df_predicted
 
     # Deteksi kolom yang tersedia
     available_cols = get_available_columns(df_predicted)
 
-    #
+    # Sidebar filter dengan pengecekan kolom dinamis
     st.sidebar.header("Filter Data")
 
     df_clean = df_predicted.copy()
@@ -664,7 +587,7 @@ if st.session_state.df_predicted is not None:
         st.sidebar.info(
             "Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
 
     # Filter Mata Kuliah
     selected_matkul = []
     if available_cols['has_matkul']:
         matkul_options = sorted(
@@ -673,7 +596,7 @@ if st.session_state.df_predicted is not None:
         selected_matkul = st.sidebar.multiselect(
             "Nama Mata Kuliah", matkul_options, default=matkul_options)
 
     # Filter Program Studi
     selected_prodi = []
     if available_cols['has_prodi']:
         prodi_options = sorted(
@@ -682,10 +605,9 @@ if st.session_state.df_predicted is not None:
         selected_prodi = st.sidebar.multiselect(
             "Program Studi", prodi_options, default=prodi_options)
 
     # Filter Tahun
     selected_tahun = []
     if available_cols['has_tahun']:
-        # Konversi tanggal ke tahun jika perlu
         if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
             df_clean['tahun'] = pd.to_datetime(
                 df_clean['tanggal'], errors='coerce').dt.year
@@ -696,7 +618,7 @@ if st.session_state.df_predicted is not None:
         selected_tahun = st.sidebar.multiselect(
             "Tahun", tahun_options, default=tahun_options)
 
     # Filter Semester
     selected_semester = []
     if available_cols['has_semester']:
         semester_options = sorted(
@@ -705,7 +627,7 @@ if st.session_state.df_predicted is not None:
         selected_semester = st.sidebar.multiselect(
             "Semester", semester_options, default=semester_options)
 
-    # Apply
+    # Apply filters
     df_filtered = df_clean.copy()
 
     if selected_matkul and available_cols['has_matkul']:
@@ -714,130 +636,123 @@ if st.session_state.df_predicted is not None:
 
     if selected_prodi and available_cols['has_prodi']:
         df_filtered = df_filtered[df_filtered["nama_prodi"].isin(
-            selected_prodi
-        )
-
-        st.info(
-            f"Menampilkan {len(df_filtered):,} dari {len(df_predicted):,} data ulasan setelah difilter."
-        )
-
-        # ============ RINGKASAN CEPAT ============
-        st.markdown("")
-        st.markdown("### Ringkasan Cepat")
-        st.markdown("")
-
-        # Hitung total sentimen dari semua aspek
-        total_pos=(df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
-        total_net=(df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
-        total_neg=(df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
-
-        # Tentukan kolom mana yang tersedia untuk ditampilkan
-        summary_cols=[]
-
-        # Kolom dasar (selalu ada)
-        summary_cols.extend(['ulasan', 'aspek'])
-
-        # Kolom opsional berdasarkan data yang tersedia
-        if available_cols['has_matkul']:
-            summary_cols.append('matkul')
-        if available_cols['has_prodi']:
-            summary_cols.append('prodi')
-        if available_cols['has_semester']:
-            summary_cols.append('semester')
-
-        col_idx += 1
-        cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
-        col_idx += 1
-
-        cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
-        col_idx += 1
-
-        if available_cols['has_semester']:
-            semester_count=df_filtered['semester'].nunique()
-            cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
-            col_idx += 1
+            selected_prodi)]
+
+    if selected_tahun and available_cols['has_tahun']:
+        df_filtered = df_filtered[df_filtered["tahun"].isin(selected_tahun)]
+
+    if selected_semester and available_cols['has_semester']:
+        df_filtered = df_filtered[df_filtered["semester"].isin(
+            selected_semester)]
+
+    # Tampilkan tabel hasil prediksi
+    st.markdown("### Tabel Data Hasil Prediksi")
+    st.dataframe(df_filtered, width='stretch')
+
+    # Download buttons
+    col_dl1, col_dl2 = st.columns(2)
+    with col_dl1:
+        st.download_button(
+            label="Unduh Data Terfilter",
+            data=convert_df_to_excel(df_filtered),
+            file_name="hasil_prediksi_absa_filtered.xlsx",
+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            use_container_width=True
+        )
+
+    with col_dl2:
+        st.download_button(
+            label="Unduh Semua Data",
+            data=convert_df_to_excel(df_predicted),
+            file_name="hasil_prediksi_absa_all.xlsx",
+            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            use_container_width=True
+        )
+
+    st.info(
+        f"Menampilkan {len(df_filtered):,} dari {len(df_predicted):,} data ulasan setelah difilter."
+    )
+
+    # Ringkasan Cepat
+    st.markdown("")
+    st.markdown("### Ringkasan Cepat")
+    st.markdown("")
+
+    total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
+    total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
+    total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
+
+    # Hitung jumlah kolom yang tersedia untuk ringkasan
+    summary_cols = []
+
+    # Kolom dasar (selalu ada)
+    summary_cols.extend(['ulasan', 'aspek'])
+
+    # Kolom opsional
+    if available_cols['has_matkul']:
+        summary_cols.append('matkul')
+    if available_cols['has_prodi']:
+        summary_cols.append('prodi')
+    if available_cols['has_semester']:
+        summary_cols.append('semester')
+
+    # Buat kolom dinamis berdasarkan data yang tersedia
+    num_cols = len(summary_cols)
+    cols = st.columns(num_cols)
+
+    col_idx = 0
+
+    # Ulasan & Aspek (selalu ada)
+    cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
+    col_idx += 1
+    cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
+    col_idx += 1
+
+    # Mata Kuliah (jika ada)
+    if available_cols['has_matkul']:
+        matkul_count = df_filtered['nama_matakuliah'].nunique()
+        cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
+        col_idx += 1
+
+    # Prodi (jika ada)
+    if available_cols['has_prodi']:
+        prodi_count = df_filtered['nama_prodi'].nunique()
+        cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
+        col_idx += 1
+
+    # Semester (jika ada)
+    if available_cols['has_semester']:
+        semester_count = df_filtered['semester'].nunique()
+        cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
+        col_idx += 1
+
+    st.markdown("")
+
+    # Baris kedua: Sentimen + info tambahan
+    summary_cols2 = ['positif', 'netral', 'negatif']
+
+    if available_cols['has_tahun']:
+        summary_cols2.append('tahun')
+    if 'kritik_saran' in df_filtered.columns:
+        summary_cols2.append('kata')
+
+    cols2 = st.columns(len(summary_cols2))
+
+    col_idx2 = 0
+    cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
+    col_idx2 += 1
+    cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
+    col_idx2 += 1
+    cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
+    col_idx2 += 1
+
+    # Rentang tahun (jika ada)
+    if available_cols['has_tahun']:
+        if 'tahun' in df_filtered.columns:
+            tahun_valid = df_filtered['tahun'].dropna()
             if len(tahun_valid) > 0:
-                tahun_min=int(tahun_valid.min())
-                tahun_max=int(tahun_valid.max())
+                tahun_min = int(tahun_valid.min())
+                tahun_max = int(tahun_valid.max())
                 if tahun_min == tahun_max:
                     cols2[col_idx2].metric("Tahun", f"{tahun_min}")
                 else:
@@ -845,95 +760,89 @@
                         "Rentang Tahun", f"{tahun_min} - {tahun_max}")
             else:
                 cols2[col_idx2].metric("Rentang Tahun", "N/A")
-
+        else:
             cols2[col_idx2].metric("Rentang Tahun", "N/A")
-
+        col_idx2 += 1
 
-        word_counts=df_filtered['kritik_saran'].astype(
-            str).str.split().str.len()
-        avg_word_count=round(word_counts.mean(), 1)
-        cols2[col_idx2].metric(
-            "Rata-rata Panjang Kata", f"{avg_word_count} kata")
-
-        cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
+    # Rata-rata panjang kata (jika ada)
+    if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
+        try:
+            word_counts = df_filtered['kritik_saran'].astype(
+                str).str.split().str.len()
+            avg_word_count = round(word_counts.mean(), 1)
+            cols2[col_idx2].metric(
+                "Rata-rata Panjang Kata", f"{avg_word_count} kata")
+        except Exception:
+            cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
 
-        st.markdown("### Visualisasi Data")
+    st.markdown("---")
+    st.markdown("### Visualisasi Data")
 
+    # Visualisasi Sentimen (selalu ditampilkan)
+    col1, col2 = st.columns(2)
+    with col1:
+        show_sentiment_bar_chart(df_filtered, ASPEK_COLUMNS)
+    with col2:
+        show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
+
+    # Visualisasi berdasarkan kolom yang tersedia
+    viz_shown = False
+
+    if available_cols['has_tahun'] or available_cols['has_semester']:
+        col1, col2 = st.columns(2)
         with col1:
             if available_cols['has_tahun']:
-                result=show_year_distribution(df_filtered)
+                result = show_year_distribution(df_filtered)
                 if result:
-                    viz_shown=True
-
+                    viz_shown = True
+        with col2:
             if available_cols['has_semester']:
-                result=show_semester_distribution(df_filtered)
+                result = show_semester_distribution(df_filtered)
                 if result:
-                    viz_shown=True
+                    viz_shown = True
+
+    if available_cols['has_prodi']:
+        st.markdown("---")
+        result = show_prodi_distribution(df_filtered)
+        if result:
+            viz_shown = True
+
+    if available_cols['has_matkul']:
+        st.markdown("---")
+        result = show_top10_matkul_distribution(df_filtered)
+        if result:
+            viz_shown = True
 
-        col1, col2=st.columns(2)
-        with col1:
+    # Sentimen per tahun/semester
+    if available_cols['has_tahun'] or available_cols['has_semester']:
+        st.markdown("---")
+        col1, col2 = st.columns(2)
+        with col1:
             if available_cols['has_tahun']:
-                result=show_sentiment_by_year(df_filtered, ASPEK_COLUMNS)
+                result = show_sentiment_by_year(df_filtered, ASPEK_COLUMNS)
                 if result:
-                    viz_shown=True
+                    viz_shown = True
+        with col2:
             if available_cols['has_semester']:
-                result=show_sentiment_by_semester(df_filtered, ASPEK_COLUMNS)
+                result = show_sentiment_by_semester(df_filtered, ASPEK_COLUMNS)
                 if result:
-                    viz_shown=True
+                    viz_shown = True
+
+    if available_cols['has_prodi']:
+        st.markdown("---")
+        result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
+        if result:
+            viz_shown = True
+
+    if available_cols['has_matkul']:
+        st.markdown("---")
+        result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
+        if result:
+            viz_shown = True
 
-        # ============ FOOTER ============
-        st.caption("""
+    # Footer
+    st.caption("""
 <div class='footer'>
-        © 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
+    © 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
 </div>
 """, unsafe_allow_html=True)
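The rewritten tail sizes its metric rows dynamically: it first collects which summary columns the upload supports, then asks `st.columns` for exactly that many slots. A reduced sketch of the pattern (the flags dict and placeholder values are illustrative, not the app's data):

```python
import streamlit as st

available = {"has_matkul": True, "has_prodi": False, "has_semester": True}

labels = ["Jumlah Ulasan", "Jumlah Aspek"]          # always shown
if available["has_matkul"]:
    labels.append("Jumlah Mata Kuliah")
if available["has_prodi"]:
    labels.append("Jumlah Prodi")
if available["has_semester"]:
    labels.append("Jumlah Semester")

cols = st.columns(len(labels))                      # one slot per metric
for col, label in zip(cols, labels):
    col.metric(label, "…")                          # real values in app.py
```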
model_utils.py
CHANGED
@@ -17,7 +17,7 @@ except ImportError:
     subprocess.check_call(['pip', 'install', 'scikit-learn'])
 from sklearn.preprocessing import LabelEncoder
 
 
 class ABSADataset(Dataset):
     """
     Custom Dataset untuk ABSA batch processing.
@@ -294,4 +294,87 @@ def predict_multi_aspect(model, tokenizer, sentence, aspek_list, label_encoder,
     # Prediksi tanpa menghitung gradient (inference mode)
     with torch.no_grad():
         # Forward pass
-        outputs = model(
+        outputs = model(input_ids, attention_mask)
+        # Konversi logits ke probabilitas dengan softmax
+        probs = F.softmax(outputs, dim=1).squeeze()
+        # Ambil indeks dengan probabilitas tertinggi
+        idx = torch.argmax(probs).item()
+        # Konversi indeks ke label sentimen
+        label = label_encoder.inverse_transform([idx])[0]
+        # Simpan hasil
+        results[aspek] = label
+
+    return results
+
+
+def predict_multi_aspect_batch(model, tokenizer, sentences, aspek_list, label_encoder, device, max_len, batch_size=None):
+    """
+    Melakukan prediksi sentimen untuk setiap aspek pada multiple kalimat menggunakan batch processing.
+    Lebih efisien untuk memproses banyak kalimat sekaligus.
+
+    Args:
+        model (nn.Module): Model ABSA yang sudah diload.
+        tokenizer (AutoTokenizer): Tokenizer IndoBERT.
+        sentences (list): List kalimat input.
+        aspek_list (list): Daftar aspek yang ingin diprediksi.
+        label_encoder (LabelEncoder): Encoder label.
+        device (torch.device): Device (cuda/cpu).
+        max_len (int): Panjang maksimum token.
+        batch_size (int, optional): Ukuran batch. Jika None, gunakan dari CONFIG.
+
+    Returns:
+        list: List of dict hasil prediksi [{aspek: label_sentimen}, ...].
+    """
+    # Set batch size dari CONFIG jika tidak diberikan
+    if batch_size is None:
+        batch_size = CONFIG.get("batch_size", 32)
+
+    # === BUAT DATASET DAN DATALOADER ===
+    # Dataset akan membuat kombinasi semua kalimat × semua aspek
+    dataset = ABSADataset(sentences, aspek_list, tokenizer, max_len)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,  # Process dalam batch untuk efisiensi
+        shuffle=False,  # Jangan shuffle untuk maintain urutan
+        num_workers=CONFIG.get("num_workers", 0)
+    )
+
+    # === INISIALISASI CONTAINER HASIL ===
+    num_sentences = len(sentences)
+    num_aspects = len(aspek_list)
+    # Buat matrix untuk menyimpan prediksi [num_sentences x num_aspects]
+    all_predictions = [[None] * num_aspects for _ in range(num_sentences)]
+
+    # === BATCH PREDICTION ===
+    model.eval()  # Set model ke evaluation mode
+    with torch.no_grad():  # Nonaktifkan gradient calculation
+        for batch in dataloader:
+            # Pindahkan batch ke device
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            sent_indices = batch['sent_idx'].numpy()
+            aspect_indices = batch['aspect_idx'].numpy()
+
+            # Forward pass untuk seluruh batch
+            outputs = model(input_ids, attention_mask)
+            # Konversi logits ke probabilitas
+            probs = F.softmax(outputs, dim=1)
+            # Ambil indeks prediksi tertinggi
+            pred_indices = torch.argmax(probs, dim=1).cpu().numpy()
+
+            # Konversi indeks ke label sentimen
+            labels = label_encoder.inverse_transform(pred_indices)
+
+            # Simpan hasil ke matrix sesuai indeks aslinya
+            for i, (sent_idx, aspect_idx, label) in enumerate(zip(sent_indices, aspect_indices, labels)):
+                all_predictions[sent_idx][aspect_idx] = label
+
+    # === KONVERSI KE FORMAT DICTIONARY ===
+    results = []
+    for predictions in all_predictions:
+        # Buat dict {aspek: label} untuk setiap kalimat
+        result_dict = {aspek: label for aspek,
+                       label in zip(aspek_list, predictions)}
+        results.append(result_dict)
+
+    return results
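A hedged usage sketch for the new `predict_multi_aspect_batch` helper, assuming `load_model_and_tokenizer` is exported by model_utils (as app.py's usage suggests) and that `CONFIG` and `ASPEK_COLUMNS` come from config.py (visualization.py imports `ASPEK_COLUMNS` from there):

```python
from config import ASPEK_COLUMNS, CONFIG
from model_utils import load_model_and_tokenizer, predict_multi_aspect_batch

# same artifacts app.py caches via get_model_resources()
model, tokenizer, label_encoder, device = load_model_and_tokenizer()

sentences = ["Dosen menjelaskan dengan baik", "Ruang kelas panas"]
results = predict_multi_aspect_batch(
    model, tokenizer, sentences, ASPEK_COLUMNS,
    label_encoder, device, max_len=CONFIG["max_len"],
)
# results: one dict per sentence, e.g. {"dosen": "positif", ...}
print(results[0])
```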
visualization.py
CHANGED
|
@@ -13,241 +13,135 @@ import plotly.express as px
|
|
| 13 |
from config import ASPEK_COLUMNS
|
| 14 |
|
| 15 |
|
| 16 |
-
# Palet warna kustom
|
| 17 |
sentimen_palette = {
|
| 18 |
-
"netral": "#FFE24C",
|
| 19 |
-
"positif": "#4CFF72",
|
| 20 |
-
"negatif": "#FF4C4C"
|
| 21 |
}
|
| 22 |
-
|
| 23 |
-
# Urutan kategori sentimen untuk konsistensi visualisasi
|
| 24 |
category_order = ["netral", "positif", "negatif"]
|
| 25 |
|
| 26 |
-
# Konfigurasi Plotly
|
| 27 |
config_options = {
|
| 28 |
-
"scrollZoom": False,
|
| 29 |
-
"displayModeBar": False
|
| 30 |
}
|
| 31 |
|
| 32 |
|
| 33 |
def show_sentiment_bar_chart(df_predicted, aspek_columns):
-    """
-    Menampilkan bar chart distribusi sentimen per aspek.
-    Chart menampilkan jumlah setiap sentimen (positif/netral/negatif) untuk setiap aspek.
-
-    Args:
-        df_predicted (pd.DataFrame): DataFrame dengan hasil prediksi sentimen
-        aspek_columns (list): List nama kolom aspek yang akan divisualisasikan
-    """
-    # Validasi: cek apakah data dan kolom aspek tersedia
+    """Menampilkan bar chart distribusi sentimen per aspek."""
    if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
        st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
        return

-    # Transform data dari wide format ke long format untuk visualisasi
    df_long = df_predicted.melt(
        value_vars=aspek_columns,
        var_name="aspek",
        value_name="sentimen"
    )
-
-    # Konversi sentimen ke categorical untuk sorting yang konsisten
    df_long["sentimen"] = pd.Categorical(
        df_long["sentimen"],
        categories=category_order,
        ordered=True
    )
-
-    # Hitung jumlah setiap kombinasi aspek-sentimen
    count_data = df_long.groupby(
        ["aspek", "sentimen"], observed=False
    ).size().reset_index(name="jumlah")
-
-    # Buat bar chart dengan Plotly
    fig = px.bar(
        count_data,
        x="aspek",
        y="jumlah",
        color="sentimen",
        barmode="group",
        color_discrete_map=sentimen_palette,
        category_orders={"sentimen": category_order}
    )
    fig.update_layout(title="Distribusi Sentimen per Aspek")
-
-    # Tampilkan chart di Streamlit
    st.plotly_chart(fig, use_container_width=True, config=config_options)


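As context for the melt step above, a tiny standalone sketch (toy data) of the wide-to-long reshape that turns one column per aspect into (aspek, sentimen) rows:

import pandas as pd

df_wide = pd.DataFrame({
    "fasilitas": ["positif", "negatif"],
    "pengajaran": ["netral", "positif"],
})
df_long = df_wide.melt(var_name="aspek", value_name="sentimen")
print(df_long)  # 4 rows: one per (aspek, sentimen) pair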
def show_sentiment_pie_chart(df_predicted, aspek_columns):
-    """
-    Menampilkan pie chart distribusi total sentimen dari semua aspek.
-    Chart menampilkan proporsi keseluruhan sentimen dalam bentuk donut chart.
-
-    Args:
-        df_predicted (pd.DataFrame): DataFrame dengan hasil prediksi sentimen
-        aspek_columns (list): List nama kolom aspek
-    """
-    # Flatten semua nilai sentimen dari semua aspek menjadi satu array
+    """Menampilkan pie chart distribusi total sentimen."""
    sentimen_total = df_predicted[aspek_columns].values.ravel()
-
-    # Hitung frekuensi setiap sentimen
    sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
    sentimen_counts.columns = ["sentimen", "jumlah"]
    sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
-
-    fig = px.pie(
-        sentimen_counts,
-        names="sentimen",
-        values="jumlah",
-        color="sentimen",
-        color_discrete_map=sentimen_palette,
-        hole=0.3  # Buat donut chart
-    )
+    fig = px.pie(sentimen_counts, names="sentimen", values="jumlah",
+                 color="sentimen", color_discrete_map=sentimen_palette,
+                 hole=0.3)
    fig.update_layout(title="Total Komposisi Sentimen")
-
-    # Tampilkan persentase dan label di dalam chart
    fig.update_traces(textposition='inside', textinfo='percent+label')
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)


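For reference, the ravel step pools every aspect column into one flat series before counting; a toy sketch:

import pandas as pd

df = pd.DataFrame({
    "fasilitas": ["positif", "negatif"],
    "pengajaran": ["netral", "positif"],
})
flat = pd.Series(df[["fasilitas", "pengajaran"]].values.ravel())
print(flat.value_counts())  # positif: 2, netral: 1, negatif: 1 (tie order may vary)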
def show_year_distribution(df):
-    """
-    Menampilkan distribusi jumlah kritik/saran per tahun.
-    Jika kolom 'tahun' tidak ada, akan mencoba ekstrak dari kolom 'tanggal'.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Coba ekstrak tahun dari kolom tanggal jika kolom tahun tidak ada
+    """Menampilkan distribusi jumlah kritik/saran per tahun."""
+    # Coba ekstrak dari kolom tanggal jika ada
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year

-    # Validasi: return None jika tidak ada kolom tahun
    if 'tahun' not in df.columns:
-        return None
+        return None  # Return None jika tidak ada kolom tahun

-    # Filter data yang memiliki nilai tahun valid
    df_tahun = df.dropna(subset=['tahun']).copy()
    if df_tahun.empty:
        return None

-    # Konversi tahun ke integer
    df_tahun['tahun'] = df_tahun['tahun'].astype(int)
-
-    # Hitung frekuensi per tahun
    year_counts = df_tahun['tahun'].value_counts().reset_index()
    year_counts.columns = ['tahun', 'jumlah']
    year_counts = year_counts.sort_values('jumlah', ascending=False)

-    fig = px.bar(
-        year_counts,
-        x='tahun',
-        y='jumlah',
-        color='tahun',
-        title="Distribusi Kritik/Saran per Tahun"
-    )
-    # Treat tahun sebagai kategori untuk menghindari interpolasi
+    fig = px.bar(year_counts, x='tahun', y='jumlah',
+                 color='tahun', title="Distribusi Kritik/Saran per Tahun")
    fig.update_layout(xaxis=dict(type='category'))
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


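A quick standalone illustration of the year-extraction idiom used above: errors='coerce' turns unparseable dates into NaT, which surface as NaN years and are removed by the dropna step:

import pandas as pd

s = pd.Series(["2023-05-01", "2024-11-12", "bukan tanggal"])
tahun = pd.to_datetime(s, errors='coerce').dt.year
print(tahun.tolist())  # [2023.0, 2024.0, nan]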
def show_semester_distribution(df):
-    """
-    Menampilkan distribusi jumlah kritik/saran per semester.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Validasi: cek apakah kolom semester ada
+    """Menampilkan distribusi jumlah kritik/saran per semester."""
    if 'semester' not in df.columns:
        return None

-    # Hitung frekuensi per semester
    semester_counts = df['semester'].value_counts().reset_index()
    semester_counts.columns = ['semester', 'jumlah']
    semester_counts = semester_counts.sort_values('jumlah', ascending=False)
-
-    fig = px.bar(
-        semester_counts,
-        x='semester',
-        y='jumlah',
-        color='semester',
-        title="Distribusi Kritik/Saran per Semester"
-    )
-    # Sort berdasarkan total descending
+    fig = px.bar(semester_counts, x='semester', y='jumlah',
+                 color='semester', title="Distribusi Kritik/Saran per Semester")
    fig.update_layout(xaxis=dict(categoryorder='total descending'))
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


def show_prodi_distribution(df):
-    """
-    Menampilkan jumlah kritik/saran per program studi dalam bentuk horizontal bar chart.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Validasi: cek apakah kolom nama_prodi ada
+    """Menampilkan jumlah kritik/saran per program studi."""
    if 'nama_prodi' not in df.columns:
        return None

-    # Hitung frekuensi per program studi
    prodi_counts = df['nama_prodi'].value_counts().reset_index()
    prodi_counts.columns = ['nama_prodi', 'jumlah']
-
-    # Sort ascending untuk horizontal bar (terbanyak di atas)
    prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
-
-    # Buat horizontal bar chart
    fig = px.bar(
        prodi_counts,
        x='jumlah',
        y='nama_prodi',
        orientation='h',
        color='jumlah',
        title="Jumlah Kritik/Saran per Program Studi"
    )
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


def show_top10_matkul_distribution(df):
-    """
-    Menampilkan 10 mata kuliah dengan jumlah kritik/saran terbanyak.
-    Format: [kode_matakuliah] - [nama_matakuliah]
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Validasi: cek apakah kolom yang diperlukan ada
+    """Menampilkan 10 mata kuliah dengan jumlah kritik/saran terbanyak."""
    required_cols = ['nama_matakuliah', 'kode_matakuliah']
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        return None

-    # Group by kode dan nama mata kuliah, ambil 10 teratas
    matkul_counts = (
        df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
        .size()
@@ -255,17 +149,12 @@ def show_top10_matkul_distribution(df):
        .sort_values(by='jumlah', ascending=False)
        .head(10)
    )
-
-    # Buat label gabungan: "kode - nama"
    matkul_counts['label'] = (
        matkul_counts['kode_matakuliah'] + " - " +
        matkul_counts['nama_matakuliah']
    )
-
-    # Sort ascending untuk horizontal bar (terbanyak di atas)
    matkul_counts = matkul_counts.sort_values(by='jumlah', ascending=True)

-    # Buat horizontal bar chart
    fig = px.bar(
        matkul_counts,
        x='jumlah',
@@ -274,124 +163,60 @@ def show_top10_matkul_distribution(df):
        title="Top 10 Mata Kuliah Berdasarkan Kritik/Saran",
        color='jumlah'
    )
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


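The top-10 selection above is a groupby/size/head pipeline; a minimal sketch with toy rows:

import pandas as pd

df = pd.DataFrame({
    "kode_matakuliah": ["IF101", "IF101", "IF202"],
    "nama_matakuliah": ["Algoritma", "Algoritma", "Basis Data"],
})
top = (
    df.groupby(["kode_matakuliah", "nama_matakuliah"], observed=False)
    .size()
    .reset_index(name="jumlah")
    .sort_values(by="jumlah", ascending=False)
    .head(10)
)
print(top)  # IF101 - Algoritma: 2, IF202 - Basis Data: 1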
def show_sentiment_by_year(df, aspek_columns):
-    """
-    Menampilkan distribusi sentimen per tahun dalam grouped bar chart.
-    Menunjukkan bagaimana sentimen berubah dari tahun ke tahun.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-        aspek_columns (list): List nama kolom aspek
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Coba ekstrak tahun dari kolom tanggal jika kolom tahun tidak ada
+    """Menampilkan distribusi sentimen per tahun."""
+    # Coba ekstrak dari kolom tanggal jika ada
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year

-    # Validasi: return None jika tidak ada kolom tahun
    if 'tahun' not in df.columns:
        return None

-    # Transform data dari wide ke long format
-    df_long = df.melt(
-        id_vars=['tahun'],
-        value_vars=aspek_columns,
-        var_name='aspek',
-        value_name='sentimen'
-    )
-
-    # Group by tahun dan sentimen, hitung frekuensi
+    df_long = df.melt(id_vars=['tahun'],
+                      value_vars=aspek_columns,
+                      var_name='aspek',
+                      value_name='sentimen')
    year_sentiment = df_long.groupby(
        ['tahun', 'sentimen'], observed=False
    ).size().reset_index(name='jumlah')
-
    year_sentiment = year_sentiment.sort_values('jumlah', ascending=False)
-
-    fig = px.bar(
-        year_sentiment,
-        x='tahun',
-        y='jumlah',
-        color='sentimen',
-        barmode='group',  # Bars dikelompokkan per tahun
-        color_discrete_map=sentimen_palette
-    )
+    fig = px.bar(year_sentiment, x='tahun', y='jumlah', color='sentimen',
+                 barmode='group', color_discrete_map=sentimen_palette)
    fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Tahun")
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


def show_sentiment_by_semester(df, aspek_columns):
-    """
-    Menampilkan distribusi sentimen per semester dalam bentuk grouped bar chart.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-        aspek_columns (list): List nama kolom aspek
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Validasi: cek apakah kolom semester ada
+    """Menampilkan distribusi sentimen per semester."""
    if 'semester' not in df.columns:
        return None

-    # Transform data dari wide ke long format
-    df_long = df.melt(
-        id_vars=['semester'],
-        value_vars=aspek_columns,
-        var_name='aspek',
-        value_name='sentimen'
-    )
-
-    # Group by semester dan sentimen, hitung frekuensi
+    df_long = df.melt(id_vars=['semester'],
+                      value_vars=aspek_columns,
+                      var_name='aspek',
+                      value_name='sentimen')
    semester_sentiment = df_long.groupby(
        ['semester', 'sentimen'], observed=False
    ).size().reset_index(name='jumlah')
-
    semester_sentiment = semester_sentiment.sort_values(
        'jumlah', ascending=False)
-
-    fig = px.bar(
-        semester_sentiment,
-        x='semester',
-        y='jumlah',
-        color='sentimen',
-        barmode='group',  # Bars dikelompokkan per semester
-        color_discrete_map=sentimen_palette
-    )
+    fig = px.bar(semester_sentiment, x='semester', y='jumlah', color='sentimen',
+                 barmode='group', color_discrete_map=sentimen_palette)
    fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Semester")
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


def show_sentiment_by_prodi(df, aspek_columns):
-    """
-    Menampilkan distribusi sentimen per program studi dalam horizontal grouped bar chart.
-    Program studi diurutkan berdasarkan total jumlah kritik/saran.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-        aspek_columns (list): List nama kolom aspek
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Validasi: cek apakah kolom nama_prodi ada
+    """Menampilkan distribusi sentimen per program studi."""
    if 'nama_prodi' not in df.columns:
        return None

-    # Transform data dari wide ke long format
    df_long = df.melt(
        id_vars=['nama_prodi'],
        value_vars=aspek_columns,
@@ -399,72 +224,51 @@ def show_sentiment_by_prodi(df, aspek_columns):
        value_name='sentimen'
    )

-    # Group by prodi dan sentimen, hitung frekuensi
    prodi_sentiment = (
        df_long.groupby(['nama_prodi', 'sentimen'], observed=False)
        .size()
        .reset_index(name='jumlah')
    )

-    # Hitung total per prodi untuk sorting
    total_per_prodi = (
        prodi_sentiment.groupby('nama_prodi')['jumlah']
        .sum()
        .sort_values(ascending=False)
    )
-
-    # Reverse order untuk horizontal bar (terbanyak di atas)
    ordered_categories = total_per_prodi.index.tolist()[::-1]

-    # Konversi ke categorical untuk maintain order
    prodi_sentiment['nama_prodi'] = pd.Categorical(
        prodi_sentiment['nama_prodi'],
        categories=ordered_categories,
        ordered=True
    )

-    # Buat horizontal grouped bar chart
    fig = px.bar(
        prodi_sentiment,
        y='nama_prodi',
        x='jumlah',
        color='sentimen',
        barmode='group',
        orientation='h',
        color_discrete_map=sentimen_palette
    )
    fig.update_layout(
        title="Distribusi Sentimen per Program Studi",
-        yaxis={
-            'categoryorder': 'array',
-            'categoryarray': ordered_categories
-        }
+        yaxis={'categoryorder': 'array',
+               'categoryarray': ordered_categories}
    )
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True


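The Categorical plus categoryarray pairing above is what pins the bar order; a small sketch (toy prodi codes, assumed for illustration):

import pandas as pd

counts = pd.DataFrame({
    "nama_prodi": ["TI", "SI", "TI", "SI"],
    "jumlah": [5, 2, 3, 1],
})
order = (counts.groupby("nama_prodi")["jumlah"].sum()
         .sort_values(ascending=False).index.tolist()[::-1])
counts["nama_prodi"] = pd.Categorical(
    counts["nama_prodi"], categories=order, ordered=True)
print(order)  # ['SI', 'TI']: the largest total ends up at the top of the chart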
def show_sentiment_by_top10_matkul(df, aspek_columns):
-    """
-    Menampilkan distribusi sentimen pada 10 mata kuliah dengan kritik/saran terbanyak.
-    Chart menggunakan horizontal grouped bar, diurutkan berdasarkan total kritik/saran.
-
-    Args:
-        df (pd.DataFrame): DataFrame input
-        aspek_columns (list): List nama kolom aspek
-
-    Returns:
-        bool/None: True jika berhasil, None jika kolom tidak tersedia
-    """
-    # Validasi: cek apakah kolom yang diperlukan ada
+    """Menampilkan distribusi sentimen pada 10 mata kuliah teratas."""
    required_cols = ['kode_matakuliah', 'nama_matakuliah']
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        return None

-    # Identifikasi top 10 mata kuliah berdasarkan jumlah kritik/saran
    df_top10 = (
        df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
        .size()
@@ -473,11 +277,9 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
        .index
    )

-    # Filter data hanya untuk top 10 mata kuliah
    df_filtered = df[df.set_index(
        ['kode_matakuliah', 'nama_matakuliah']).index.isin(df_top10)]

-    # Transform data dari wide ke long format
    df_long = df_filtered.melt(
        id_vars=['kode_matakuliah', 'nama_matakuliah'],
        value_vars=aspek_columns,
@@ -485,36 +287,29 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
        value_name='sentimen'
    )

-    # Buat label gabungan: "kode - nama"
    df_long['label'] = (
        df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
    )

-    # Group by label dan sentimen, hitung frekuensi
    matkul_sentiment = (
        df_long.groupby(['label', 'sentimen'], observed=False)
        .size()
        .reset_index(name='jumlah')
    )

-    # Hitung total per label untuk sorting
    total_per_label = (
        matkul_sentiment.groupby('label')['jumlah']
        .sum()
        .sort_values(ascending=False)
    )
-
-    # Reverse order untuk horizontal bar (terbanyak di atas)
    ordered_labels = total_per_label.index.tolist()[::-1]

-    # Konversi ke categorical untuk maintain order
    matkul_sentiment['label'] = pd.Categorical(
        matkul_sentiment['label'],
        categories=ordered_labels,
        ordered=True
    )

-    # Buat horizontal grouped bar chart
    fig = px.bar(
        matkul_sentiment,
        y='label',
@@ -526,11 +321,48 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
    )
    fig.update_layout(
        title="Distribusi Sentimen pada Top 10 Mata Kuliah",
-        yaxis={
-            'categoryorder': 'array',
-            'categoryarray': ordered_labels
-        }
+        yaxis={'categoryorder': 'array', 'categoryarray': ordered_labels}
    )
-
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
+
+
+def show_sentiment_stacked_percentage(df, aspek_columns):
+    """Menampilkan stacked bar chart dengan persentase sentimen per aspek."""
+
+    if df.empty or not set(aspek_columns).issubset(df.columns):
+        st.warning("Data atau kolom aspek tidak tersedia.")
+        return
+
+    df_long = df.melt(
+        value_vars=aspek_columns,
+        var_name="aspek",
+        value_name="sentimen"
+    )
+
+    # Hitung persentase
+    count_data = df_long.groupby(
+        ['aspek', 'sentimen']).size().reset_index(name='jumlah')
+    total_per_aspek = count_data.groupby('aspek')['jumlah'].sum().reset_index()
+    total_per_aspek.columns = ['aspek', 'total']
+    count_data = count_data.merge(total_per_aspek, on='aspek')
+    count_data['persentase'] = (
+        count_data['jumlah'] / count_data['total']) * 100
+
+    fig = px.bar(
+        count_data,
+        x="aspek",
+        y="persentase",
+        color="sentimen",
+        title="Persentase Distribusi Sentimen per Aspek",
+        color_discrete_map=sentimen_palette,
+        category_orders={
+            "sentimen": category_order,
+            "aspek": aspek_columns
+        }
+    )
+    fig.update_layout(
+        yaxis_title="Persentase (%)",
+        xaxis_title="Aspek"
+    )
+    st.plotly_chart(fig, use_container_width=True, config=config_options)
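For the new percentage function, a standalone sketch (toy data) of its count, merge, and percentage pipeline:

import pandas as pd

df_long = pd.DataFrame({
    "aspek": ["fasilitas", "fasilitas", "fasilitas", "pengajaran"],
    "sentimen": ["positif", "positif", "negatif", "netral"],
})
count_data = df_long.groupby(["aspek", "sentimen"]).size().reset_index(name="jumlah")
total = count_data.groupby("aspek")["jumlah"].sum().reset_index(name="total")
count_data = count_data.merge(total, on="aspek")
count_data["persentase"] = count_data["jumlah"] / count_data["total"] * 100
print(count_data)  # fasilitas: 66.7% positif / 33.3% negatif; pengajaran: 100% netral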