Spaces:
Sleeping
Sleeping
merapikan file, menambahkan komentar penting pada config.py, preprocessing.py, visualization.py, dan app.py
Browse files- app.py +288 -197
- config.py +0 -1
- preprocessing.py +66 -51
- visualization.py +254 -86
app.py
CHANGED
|
@@ -6,6 +6,8 @@ berbasis aspek dari kritik dan saran mahasiswa.
|
|
| 6 |
UPDATED: Dengan Batch + Chunked Processing + Session-based Cache untuk multi-user
|
| 7 |
UPDATED: Visualisasi dinamis yang menyesuaikan dengan kolom yang tersedia
|
| 8 |
"""
|
|
|
|
|
|
|
| 9 |
import os
|
| 10 |
import time
|
| 11 |
import gc
|
|
@@ -36,41 +38,48 @@ from visualization import (
|
|
| 36 |
from preprocessing import text_preprocessing_pipeline
|
| 37 |
|
| 38 |
# Konfigurasi untuk chunked processing
|
| 39 |
-
CHUNK_SIZE = 2500
|
| 40 |
-
ENABLE_CHUNKED = True
|
| 41 |
-
CACHE_EXPIRY_HOURS = 24
|
| 42 |
|
|
|
|
| 43 |
os.makedirs("chache_file", exist_ok=True)
|
| 44 |
os.makedirs("chache_file/sessions", exist_ok=True)
|
| 45 |
|
| 46 |
-
# Konfigurasi halaman
|
| 47 |
st.set_page_config(
|
| 48 |
page_title="ABSA IndoBERT",
|
| 49 |
layout="wide",
|
| 50 |
page_icon="💬"
|
| 51 |
)
|
| 52 |
|
| 53 |
-
# Load custom CSS
|
| 54 |
with open(os.path.join("assets", "style.css"), encoding="utf-8") as f:
|
| 55 |
st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
|
| 56 |
st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css" rel="stylesheet">', unsafe_allow_html=True)
|
| 57 |
|
| 58 |
|
| 59 |
def get_session_id():
|
| 60 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 61 |
query_params = st.query_params
|
| 62 |
|
|
|
|
| 63 |
if "sid" in query_params:
|
| 64 |
sid = query_params["sid"]
|
| 65 |
st.session_state.session_id = sid
|
| 66 |
return sid
|
| 67 |
|
|
|
|
| 68 |
if "session_id" not in st.session_state:
|
| 69 |
new_session_id = str(uuid.uuid4())
|
| 70 |
st.session_state.session_id = new_session_id
|
| 71 |
st.query_params["sid"] = new_session_id
|
| 72 |
return new_session_id
|
| 73 |
|
|
|
|
| 74 |
existing_id = st.session_state.session_id
|
| 75 |
st.query_params["sid"] = existing_id
|
| 76 |
return existing_id
|
|
@@ -92,7 +101,10 @@ def get_session_chunks_dir():
|
|
| 92 |
|
| 93 |
|
| 94 |
def cleanup_old_sessions():
|
| 95 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 96 |
sessions_dir = Path("chache_file/sessions")
|
| 97 |
if not sessions_dir.exists():
|
| 98 |
return
|
|
@@ -103,6 +115,7 @@ def cleanup_old_sessions():
|
|
| 103 |
mod_time = session_dir.stat().st_mtime
|
| 104 |
age_hours = (current_time - mod_time) / 3600
|
| 105 |
|
|
|
|
| 106 |
if age_hours > CACHE_EXPIRY_HOURS:
|
| 107 |
try:
|
| 108 |
shutil.rmtree(session_dir)
|
|
@@ -111,18 +124,24 @@ def cleanup_old_sessions():
|
|
| 111 |
print(f"Error deleting session {session_dir.name}: {e}")
|
| 112 |
|
| 113 |
|
|
|
|
| 114 |
cleanup_old_sessions()
|
| 115 |
|
| 116 |
|
| 117 |
@st.cache_resource(show_spinner=False)
|
| 118 |
def get_model_resources():
|
| 119 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 120 |
return load_model_and_tokenizer()
|
| 121 |
|
| 122 |
|
|
|
|
| 123 |
with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
|
| 124 |
model, tokenizer, le, device = get_model_resources()
|
| 125 |
|
|
|
|
| 126 |
success_placeholder = st.empty()
|
| 127 |
success_placeholder.success("Model dan tokenizer berhasil dimuat!")
|
| 128 |
time.sleep(1)
|
|
@@ -130,7 +149,7 @@ success_placeholder.empty()
|
|
| 130 |
|
| 131 |
|
| 132 |
def convert_df_to_excel(df):
|
| 133 |
-
"""Mengubah DataFrame menjadi file Excel dalam bentuk byte stream
|
| 134 |
output = BytesIO()
|
| 135 |
with pd.ExcelWriter(output, engine="openpyxl") as writer:
|
| 136 |
df.to_excel(writer, index=False)
|
|
@@ -138,7 +157,7 @@ def convert_df_to_excel(df):
|
|
| 138 |
|
| 139 |
|
| 140 |
def clear_memory():
|
| 141 |
-
"""Clear memory cache"""
|
| 142 |
gc.collect()
|
| 143 |
if torch.cuda.is_available():
|
| 144 |
torch.cuda.empty_cache()
|
|
@@ -146,8 +165,20 @@ def clear_memory():
|
|
| 146 |
|
| 147 |
def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
|
| 148 |
"""
|
| 149 |
-
Memproses satu chunk data dengan batch processing
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
"""
|
| 152 |
# STEP 1: Preprocessing (0-100%)
|
| 153 |
cleaned_text_list = []
|
|
@@ -157,6 +188,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 157 |
clean_text = text_preprocessing_pipeline(str(raw_text))
|
| 158 |
cleaned_text_list.append(clean_text)
|
| 159 |
|
|
|
|
| 160 |
if idx % 50 == 0 or idx == total_rows - 1:
|
| 161 |
progress = (idx + 1) / total_rows
|
| 162 |
progress_bar.progress(progress)
|
|
@@ -174,6 +206,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 174 |
num_sents = len(cleaned_text_list)
|
| 175 |
num_asps = len(ASPEK_COLUMNS)
|
| 176 |
|
|
|
|
| 177 |
ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
|
| 178 |
tokenizer, CONFIG["max_len"])
|
| 179 |
dl = DataLoader(
|
|
@@ -183,11 +216,13 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 183 |
num_workers=0
|
| 184 |
)
|
| 185 |
|
|
|
|
| 186 |
predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
|
| 187 |
|
| 188 |
batch_counter = 0
|
| 189 |
total_batch_count = len(dl)
|
| 190 |
|
|
|
|
| 191 |
model.eval()
|
| 192 |
with torch.no_grad():
|
| 193 |
for batch_data in dl:
|
|
@@ -196,15 +231,18 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 196 |
sent_idxs = batch_data['sent_idx'].numpy()
|
| 197 |
asp_idxs = batch_data['aspect_idx'].numpy()
|
| 198 |
|
|
|
|
| 199 |
model_outputs = model(inp_ids, attn_mask)
|
| 200 |
probabilities = F.softmax(model_outputs, dim=1)
|
| 201 |
predicted_indices = torch.argmax(
|
| 202 |
probabilities, dim=1).cpu().numpy()
|
| 203 |
pred_labels = le.inverse_transform(predicted_indices)
|
| 204 |
|
|
|
|
| 205 |
for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
|
| 206 |
predictions_matrix[s_idx][a_idx] = lbl
|
| 207 |
|
|
|
|
| 208 |
batch_counter += 1
|
| 209 |
progress = batch_counter / total_batch_count
|
| 210 |
progress_bar.progress(progress)
|
|
@@ -216,12 +254,14 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 216 |
for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
|
| 217 |
row_dict = data_row.to_dict()
|
| 218 |
row_dict["kritik_saran"] = cleaned_text_list[idx]
|
|
|
|
| 219 |
for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
|
| 220 |
row_dict[asp_name] = predictions_matrix[idx][asp_idx]
|
| 221 |
result_list.append(row_dict)
|
| 222 |
|
| 223 |
result_dataframe = pd.DataFrame(result_list)
|
| 224 |
|
|
|
|
| 225 |
chunks_directory = get_session_chunks_dir()
|
| 226 |
chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
|
| 227 |
result_dataframe.to_csv(chunk_filepath, index=False)
|
|
@@ -230,13 +270,17 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 230 |
progress_bar.progress(1.0)
|
| 231 |
status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
|
| 232 |
|
|
|
|
| 233 |
clear_memory()
|
| 234 |
|
| 235 |
return result_dataframe
|
| 236 |
|
| 237 |
|
| 238 |
def get_available_columns(df):
|
| 239 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 240 |
available = {
|
| 241 |
'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
|
| 242 |
'has_semester': 'semester' in df.columns,
|
|
@@ -246,6 +290,8 @@ def get_available_columns(df):
|
|
| 246 |
return available
|
| 247 |
|
| 248 |
|
|
|
|
|
|
|
| 249 |
# Judul aplikasi
|
| 250 |
st.markdown("""
|
| 251 |
<h1 class='title-center'>ABSA IndoBERT</h1>
|
|
@@ -257,7 +303,7 @@ st.markdown(" ")
|
|
| 257 |
st.markdown(" ")
|
| 258 |
st.markdown(" ")
|
| 259 |
|
| 260 |
-
# Panduan
|
| 261 |
steps = [
|
| 262 |
{"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
|
| 263 |
"description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
|
|
@@ -269,6 +315,7 @@ steps = [
|
|
| 269 |
"description": "Unduh hasil analisis lengkap Anda dalam format file Excel untuk laporan lebih lanjut."}
|
| 270 |
]
|
| 271 |
|
|
|
|
| 272 |
cols = st.columns(len(steps))
|
| 273 |
|
| 274 |
for i, step in enumerate(steps):
|
|
@@ -284,18 +331,19 @@ for i, step in enumerate(steps):
|
|
| 284 |
st.markdown("")
|
| 285 |
st.markdown("")
|
| 286 |
|
| 287 |
-
# Upload file
|
| 288 |
uploaded_file = st.file_uploader(
|
| 289 |
" Upload Data Kritik & Saran",
|
| 290 |
type=["xlsx"],
|
| 291 |
help="File maksimal 200MB dengan format .xlsx"
|
| 292 |
)
|
| 293 |
|
| 294 |
-
#
|
| 295 |
session_cache_dir = get_session_cache_dir()
|
| 296 |
session_result_file = session_cache_dir / "temp_predicted.csv"
|
| 297 |
session_chunks_dir = get_session_chunks_dir()
|
| 298 |
|
|
|
|
| 299 |
if session_result_file.exists():
|
| 300 |
if st.button("Hapus Cache Data"):
|
| 301 |
session_result_file.unlink()
|
|
@@ -303,6 +351,7 @@ if session_result_file.exists():
|
|
| 303 |
time.sleep(1)
|
| 304 |
st.rerun()
|
| 305 |
|
|
|
|
| 306 |
if session_chunks_dir.exists():
|
| 307 |
chunk_files = list(session_chunks_dir.glob("*.csv"))
|
| 308 |
if chunk_files:
|
|
@@ -314,6 +363,7 @@ if session_chunks_dir.exists():
|
|
| 314 |
time.sleep(1)
|
| 315 |
st.rerun()
|
| 316 |
|
|
|
|
| 317 |
if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
|
| 318 |
if not uploaded_file:
|
| 319 |
metadata_file = session_cache_dir / "metadata.txt"
|
|
@@ -334,13 +384,15 @@ if session_result_file.exists() or (session_chunks_dir.exists() and list(session
|
|
| 334 |
else:
|
| 335 |
st.caption(" ")
|
| 336 |
|
| 337 |
-
|
| 338 |
if "df_predicted" not in st.session_state:
|
| 339 |
st.session_state.df_predicted = None
|
| 340 |
|
|
|
|
| 341 |
if st.session_state.df_predicted is None and session_result_file.exists():
|
| 342 |
try:
|
| 343 |
df_cached = pd.read_csv(session_result_file)
|
|
|
|
| 344 |
if "tahun" in df_cached.columns:
|
| 345 |
df_cached["tahun"] = pd.to_numeric(
|
| 346 |
df_cached["tahun"], errors='coerce').astype('Int64')
|
|
@@ -350,14 +402,20 @@ if st.session_state.df_predicted is None and session_result_file.exists():
|
|
| 350 |
st.warning(f"Gagal memuat cache: {e}")
|
| 351 |
|
| 352 |
|
|
|
|
| 353 |
if uploaded_file:
|
| 354 |
file_bytes = uploaded_file.getvalue()
|
|
|
|
|
|
|
| 355 |
if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
|
| 356 |
st.session_state.last_uploaded_file = file_bytes
|
| 357 |
st.session_state.uploaded_filename = uploaded_file.name
|
|
|
|
| 358 |
try:
|
|
|
|
| 359 |
df_uploaded = pd.read_excel(BytesIO(file_bytes))
|
| 360 |
|
|
|
|
| 361 |
if "tahun" in df_uploaded.columns:
|
| 362 |
df_uploaded["tahun"] = pd.to_numeric(
|
| 363 |
df_uploaded["tahun"], errors='coerce').astype('Int64')
|
|
@@ -365,11 +423,15 @@ if uploaded_file:
|
|
| 365 |
except ValueError as err:
|
| 366 |
st.error(f"Gagal membaca file: {err}")
|
| 367 |
else:
|
|
|
|
| 368 |
if "kritik_saran" not in df_uploaded.columns:
|
| 369 |
st.error("Kolom 'kritik_saran' tidak ditemukan.")
|
| 370 |
else:
|
|
|
|
| 371 |
df_uploaded = df_uploaded.drop_duplicates(
|
| 372 |
subset=["kritik_saran"])
|
|
|
|
|
|
|
| 373 |
for aspect_col in ASPEK_COLUMNS:
|
| 374 |
if aspect_col not in df_uploaded.columns:
|
| 375 |
df_uploaded[aspect_col] = None
|
|
@@ -379,9 +441,11 @@ if uploaded_file:
|
|
| 379 |
total_rows = len(df_uploaded)
|
| 380 |
use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
|
| 381 |
|
|
|
|
| 382 |
if use_chunked:
|
| 383 |
num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
|
| 384 |
|
|
|
|
| 385 |
info_col1, info_col2, info_col3 = st.columns(3)
|
| 386 |
with info_col1:
|
| 387 |
st.info(f"**Total data:** {total_rows:,} rows")
|
|
@@ -398,6 +462,7 @@ if uploaded_file:
|
|
| 398 |
chunk_status_text = st.empty()
|
| 399 |
overall_status = st.empty()
|
| 400 |
|
|
|
|
| 401 |
for start_idx in range(0, total_rows, CHUNK_SIZE):
|
| 402 |
current_chunk_number = (start_idx // CHUNK_SIZE) + 1
|
| 403 |
current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
|
|
@@ -406,6 +471,7 @@ if uploaded_file:
|
|
| 406 |
current_chunk_file = session_chunks_dir / \
|
| 407 |
f"chunk_{current_chunk_number}.csv"
|
| 408 |
|
|
|
|
| 409 |
if current_chunk_file.exists():
|
| 410 |
chunk_result = pd.read_csv(current_chunk_file)
|
| 411 |
all_chunk_results.append(chunk_result)
|
|
@@ -424,6 +490,7 @@ if uploaded_file:
|
|
| 424 |
time.sleep(0.3)
|
| 425 |
continue
|
| 426 |
|
|
|
|
| 427 |
chunk_progress_bar.progress(0)
|
| 428 |
|
| 429 |
chunk_result = process_chunk_batch(
|
|
@@ -432,6 +499,7 @@ if uploaded_file:
|
|
| 432 |
)
|
| 433 |
all_chunk_results.append(chunk_result)
|
| 434 |
|
|
|
|
| 435 |
processed = min(start_idx + CHUNK_SIZE, total_rows)
|
| 436 |
progress_pct = (processed / total_rows) * 100
|
| 437 |
elapsed = time.time() - start_time
|
|
@@ -446,6 +514,7 @@ if uploaded_file:
|
|
| 446 |
|
| 447 |
time.sleep(0.3)
|
| 448 |
|
|
|
|
| 449 |
chunk_status_text.empty()
|
| 450 |
overall_status.info("🔄 Menggabungkan semua chunks...")
|
| 451 |
df_session = pd.concat(
|
|
@@ -455,6 +524,7 @@ if uploaded_file:
|
|
| 455 |
end_time = time.time()
|
| 456 |
duration = end_time - start_time
|
| 457 |
|
|
|
|
| 458 |
else:
|
| 459 |
st.info(
|
| 460 |
f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
|
|
@@ -464,6 +534,7 @@ if uploaded_file:
|
|
| 464 |
progress_bar = st.progress(0)
|
| 465 |
status_text = st.empty()
|
| 466 |
|
|
|
|
| 467 |
cleaned_text_list = []
|
| 468 |
total_preprocessing = len(df_uploaded)
|
| 469 |
|
|
@@ -481,6 +552,7 @@ if uploaded_file:
|
|
| 481 |
status_text.text("Memulai prediksi...")
|
| 482 |
time.sleep(0.3)
|
| 483 |
|
|
|
|
| 484 |
batch_sz = CONFIG.get("batch_size", 32)
|
| 485 |
num_sents = len(cleaned_text_list)
|
| 486 |
num_asps = len(ASPEK_COLUMNS)
|
|
@@ -520,6 +592,7 @@ if uploaded_file:
|
|
| 520 |
status_text.text(
|
| 521 |
f"Predicting: {batch_counter}/{total_batch_count} batches")
|
| 522 |
|
|
|
|
| 523 |
result_list = []
|
| 524 |
for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
|
| 525 |
row_dict = data_row.to_dict()
|
|
@@ -539,16 +612,20 @@ if uploaded_file:
|
|
| 539 |
end_time = time.time()
|
| 540 |
duration = end_time - start_time
|
| 541 |
|
|
|
|
| 542 |
st.session_state.df_predicted = df_session
|
| 543 |
df_session.to_csv(session_result_file, index=False)
|
| 544 |
|
|
|
|
| 545 |
metadata_file = session_cache_dir / "metadata.txt"
|
| 546 |
with open(metadata_file, "w", encoding="utf-8") as f:
|
| 547 |
f.write(uploaded_file.name)
|
| 548 |
|
|
|
|
| 549 |
total_items = total_rows * len(ASPEK_COLUMNS)
|
| 550 |
items_per_second = total_items / duration if duration > 0 else 0
|
| 551 |
|
|
|
|
| 552 |
if use_chunked:
|
| 553 |
st.success(
|
| 554 |
f"✅ **Chunked + Batch Processing selesai!**\n\n"
|
|
@@ -568,14 +645,14 @@ if uploaded_file:
|
|
| 568 |
f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
|
| 569 |
)
|
| 570 |
|
| 571 |
-
#
|
| 572 |
if st.session_state.df_predicted is not None:
|
| 573 |
df_predicted = st.session_state.df_predicted
|
| 574 |
|
| 575 |
-
# Deteksi kolom yang tersedia
|
| 576 |
available_cols = get_available_columns(df_predicted)
|
| 577 |
|
| 578 |
-
#
|
| 579 |
st.sidebar.header("Filter Data")
|
| 580 |
|
| 581 |
df_clean = df_predicted.copy()
|
|
@@ -587,7 +664,7 @@ if st.session_state.df_predicted is not None:
|
|
| 587 |
st.sidebar.info(
|
| 588 |
"Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
|
| 589 |
|
| 590 |
-
# Filter Mata Kuliah
|
| 591 |
selected_matkul = []
|
| 592 |
if available_cols['has_matkul']:
|
| 593 |
matkul_options = sorted(
|
|
@@ -596,7 +673,7 @@ if st.session_state.df_predicted is not None:
|
|
| 596 |
selected_matkul = st.sidebar.multiselect(
|
| 597 |
"Nama Mata Kuliah", matkul_options, default=matkul_options)
|
| 598 |
|
| 599 |
-
# Filter Program Studi
|
| 600 |
selected_prodi = []
|
| 601 |
if available_cols['has_prodi']:
|
| 602 |
prodi_options = sorted(
|
|
@@ -605,9 +682,10 @@ if st.session_state.df_predicted is not None:
|
|
| 605 |
selected_prodi = st.sidebar.multiselect(
|
| 606 |
"Program Studi", prodi_options, default=prodi_options)
|
| 607 |
|
| 608 |
-
# Filter Tahun
|
| 609 |
selected_tahun = []
|
| 610 |
if available_cols['has_tahun']:
|
|
|
|
| 611 |
if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
|
| 612 |
df_clean['tahun'] = pd.to_datetime(
|
| 613 |
df_clean['tanggal'], errors='coerce').dt.year
|
|
@@ -618,7 +696,7 @@ if st.session_state.df_predicted is not None:
|
|
| 618 |
selected_tahun = st.sidebar.multiselect(
|
| 619 |
"Tahun", tahun_options, default=tahun_options)
|
| 620 |
|
| 621 |
-
# Filter Semester
|
| 622 |
selected_semester = []
|
| 623 |
if available_cols['has_semester']:
|
| 624 |
semester_options = sorted(
|
|
@@ -627,7 +705,7 @@ if st.session_state.df_predicted is not None:
|
|
| 627 |
selected_semester = st.sidebar.multiselect(
|
| 628 |
"Semester", semester_options, default=semester_options)
|
| 629 |
|
| 630 |
-
# Apply
|
| 631 |
df_filtered = df_clean.copy()
|
| 632 |
|
| 633 |
if selected_matkul and available_cols['has_matkul']:
|
|
@@ -636,123 +714,130 @@ if st.session_state.df_predicted is not None:
|
|
| 636 |
|
| 637 |
if selected_prodi and available_cols['has_prodi']:
|
| 638 |
df_filtered = df_filtered[df_filtered["nama_prodi"].isin(
|
| 639 |
-
selected_prodi
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
# Ulasan & Aspek (selalu ada)
|
| 706 |
-
cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
|
| 707 |
-
col_idx += 1
|
| 708 |
-
cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
|
| 709 |
-
col_idx += 1
|
| 710 |
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
|
| 715 |
-
col_idx += 1
|
| 716 |
|
| 717 |
-
|
| 718 |
-
if available_cols['has_prodi']:
|
| 719 |
-
prodi_count = df_filtered['nama_prodi'].nunique()
|
| 720 |
-
cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
|
| 721 |
-
col_idx += 1
|
| 722 |
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
|
| 729 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
|
| 731 |
-
|
| 732 |
-
|
|
|
|
|
|
|
|
|
|
| 733 |
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
|
|
|
| 738 |
|
| 739 |
-
|
| 740 |
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
col_idx2 += 1
|
| 744 |
-
cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
|
| 745 |
-
col_idx2 += 1
|
| 746 |
-
cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
|
| 747 |
-
col_idx2 += 1
|
| 748 |
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
if len(tahun_valid) > 0:
|
| 754 |
-
tahun_min
|
| 755 |
-
tahun_max
|
| 756 |
if tahun_min == tahun_max:
|
| 757 |
cols2[col_idx2].metric("Tahun", f"{tahun_min}")
|
| 758 |
else:
|
|
@@ -760,89 +845,95 @@ if st.session_state.df_predicted is not None:
|
|
| 760 |
"Rentang Tahun", f"{tahun_min} - {tahun_max}")
|
| 761 |
else:
|
| 762 |
cols2[col_idx2].metric("Rentang Tahun", "N/A")
|
| 763 |
-
|
| 764 |
cols2[col_idx2].metric("Rentang Tahun", "N/A")
|
| 765 |
-
|
| 766 |
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
word_counts
|
| 771 |
str).str.split().str.len()
|
| 772 |
-
avg_word_count
|
| 773 |
cols2[col_idx2].metric(
|
| 774 |
"Rata-rata Panjang Kata", f"{avg_word_count} kata")
|
| 775 |
-
|
| 776 |
cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
|
| 777 |
|
| 778 |
-
|
| 779 |
-
|
|
|
|
| 780 |
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
|
| 788 |
-
|
| 789 |
-
|
| 790 |
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
|
|
|
| 794 |
if available_cols['has_tahun']:
|
| 795 |
-
result
|
| 796 |
if result:
|
| 797 |
-
viz_shown
|
| 798 |
-
|
| 799 |
if available_cols['has_semester']:
|
| 800 |
-
result
|
| 801 |
if result:
|
| 802 |
-
viz_shown
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
|
|
|
|
|
|
| 821 |
if available_cols['has_tahun']:
|
| 822 |
-
result
|
| 823 |
if result:
|
| 824 |
-
viz_shown
|
| 825 |
-
|
| 826 |
if available_cols['has_semester']:
|
| 827 |
-
result
|
| 828 |
if result:
|
| 829 |
-
viz_shown
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
|
|
|
|
|
|
| 845 |
<div class='footer'>
|
| 846 |
-
© 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
|
| 847 |
</div>
|
| 848 |
""", unsafe_allow_html=True)
|
|
|
|
| 6 |
UPDATED: Dengan Batch + Chunked Processing + Session-based Cache untuk multi-user
|
| 7 |
UPDATED: Visualisasi dinamis yang menyesuaikan dengan kolom yang tersedia
|
| 8 |
"""
|
| 9 |
+
|
| 10 |
+
# Import library yang diperlukan
|
| 11 |
import os
|
| 12 |
import time
|
| 13 |
import gc
|
|
|
|
| 38 |
from preprocessing import text_preprocessing_pipeline
|
| 39 |
|
| 40 |
# Konfigurasi untuk chunked processing
|
| 41 |
+
CHUNK_SIZE = 2500 # Ukuran chunk untuk memproses data besar
|
| 42 |
+
ENABLE_CHUNKED = True # Aktifkan mode chunked processing
|
| 43 |
+
CACHE_EXPIRY_HOURS = 24 # Durasi cache sebelum dihapus otomatis
|
| 44 |
|
| 45 |
+
# Membuat direktori cache jika belum ada
|
| 46 |
os.makedirs("chache_file", exist_ok=True)
|
| 47 |
os.makedirs("chache_file/sessions", exist_ok=True)
|
| 48 |
|
| 49 |
+
# Konfigurasi halaman Streamlit
|
| 50 |
st.set_page_config(
|
| 51 |
page_title="ABSA IndoBERT",
|
| 52 |
layout="wide",
|
| 53 |
page_icon="💬"
|
| 54 |
)
|
| 55 |
|
| 56 |
+
# Load custom CSS untuk styling
|
| 57 |
with open(os.path.join("assets", "style.css"), encoding="utf-8") as f:
|
| 58 |
st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
|
| 59 |
st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css" rel="stylesheet">', unsafe_allow_html=True)
|
| 60 |
|
| 61 |
|
| 62 |
def get_session_id():
|
| 63 |
+
"""
|
| 64 |
+
Generate atau retrieve session ID untuk user - PERSISTENT across refresh
|
| 65 |
+
Menggunakan query params agar session tetap konsisten saat refresh
|
| 66 |
+
"""
|
| 67 |
query_params = st.query_params
|
| 68 |
|
| 69 |
+
# Cek jika sudah ada session ID di URL
|
| 70 |
if "sid" in query_params:
|
| 71 |
sid = query_params["sid"]
|
| 72 |
st.session_state.session_id = sid
|
| 73 |
return sid
|
| 74 |
|
| 75 |
+
# Buat session ID baru jika belum ada
|
| 76 |
if "session_id" not in st.session_state:
|
| 77 |
new_session_id = str(uuid.uuid4())
|
| 78 |
st.session_state.session_id = new_session_id
|
| 79 |
st.query_params["sid"] = new_session_id
|
| 80 |
return new_session_id
|
| 81 |
|
| 82 |
+
# Gunakan session ID yang sudah ada
|
| 83 |
existing_id = st.session_state.session_id
|
| 84 |
st.query_params["sid"] = existing_id
|
| 85 |
return existing_id
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
def cleanup_old_sessions():
|
| 104 |
+
"""
|
| 105 |
+
Hapus session cache yang sudah expired (> 24 jam)
|
| 106 |
+
Membersihkan cache lama untuk menghemat storage
|
| 107 |
+
"""
|
| 108 |
sessions_dir = Path("chache_file/sessions")
|
| 109 |
if not sessions_dir.exists():
|
| 110 |
return
|
|
|
|
| 115 |
mod_time = session_dir.stat().st_mtime
|
| 116 |
age_hours = (current_time - mod_time) / 3600
|
| 117 |
|
| 118 |
+
# Hapus jika lebih dari 24 jam
|
| 119 |
if age_hours > CACHE_EXPIRY_HOURS:
|
| 120 |
try:
|
| 121 |
shutil.rmtree(session_dir)
|
|
|
|
| 124 |
print(f"Error deleting session {session_dir.name}: {e}")
|
| 125 |
|
| 126 |
|
| 127 |
+
# Jalankan cleanup saat aplikasi dimulai
|
| 128 |
cleanup_old_sessions()
|
| 129 |
|
| 130 |
|
| 131 |
@st.cache_resource(show_spinner=False)
|
| 132 |
def get_model_resources():
|
| 133 |
+
"""
|
| 134 |
+
Memuat model dan tokenizer IndoBERT
|
| 135 |
+
Menggunakan cache agar model tidak dimuat ulang setiap kali
|
| 136 |
+
"""
|
| 137 |
return load_model_and_tokenizer()
|
| 138 |
|
| 139 |
|
| 140 |
+
# Load model dengan spinner
|
| 141 |
with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
|
| 142 |
model, tokenizer, le, device = get_model_resources()
|
| 143 |
|
| 144 |
+
# Tampilkan notifikasi sukses sementara
|
| 145 |
success_placeholder = st.empty()
|
| 146 |
success_placeholder.success("Model dan tokenizer berhasil dimuat!")
|
| 147 |
time.sleep(1)
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
def convert_df_to_excel(df):
|
| 152 |
+
"""Mengubah DataFrame menjadi file Excel dalam bentuk byte stream untuk download"""
|
| 153 |
output = BytesIO()
|
| 154 |
with pd.ExcelWriter(output, engine="openpyxl") as writer:
|
| 155 |
df.to_excel(writer, index=False)
|
|
|
|
| 157 |
|
| 158 |
|
| 159 |
def clear_memory():
|
| 160 |
+
"""Clear memory cache untuk menghemat RAM dan VRAM"""
|
| 161 |
gc.collect()
|
| 162 |
if torch.cuda.is_available():
|
| 163 |
torch.cuda.empty_cache()
|
|
|
|
| 165 |
|
| 166 |
def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
|
| 167 |
"""
|
| 168 |
+
Memproses satu chunk data dengan batch processing
|
| 169 |
+
STEP 1: Preprocessing teks (cleaning, normalisasi)
|
| 170 |
+
STEP 2: Batch Prediction menggunakan model IndoBERT
|
| 171 |
+
STEP 3: Combine results dan simpan ke file CSV
|
| 172 |
+
|
| 173 |
+
Args:
|
| 174 |
+
chunk_dataframe: Data chunk yang akan diproses
|
| 175 |
+
chunk_num: Nomor chunk saat ini
|
| 176 |
+
total_chunk_count: Total jumlah chunk
|
| 177 |
+
progress_bar: Progress bar Streamlit
|
| 178 |
+
status_text: Text status Streamlit
|
| 179 |
+
|
| 180 |
+
Returns:
|
| 181 |
+
result_dataframe: DataFrame hasil prediksi untuk chunk ini
|
| 182 |
"""
|
| 183 |
# STEP 1: Preprocessing (0-100%)
|
| 184 |
cleaned_text_list = []
|
|
|
|
| 188 |
clean_text = text_preprocessing_pipeline(str(raw_text))
|
| 189 |
cleaned_text_list.append(clean_text)
|
| 190 |
|
| 191 |
+
# Update progress bar setiap 50 baris
|
| 192 |
if idx % 50 == 0 or idx == total_rows - 1:
|
| 193 |
progress = (idx + 1) / total_rows
|
| 194 |
progress_bar.progress(progress)
|
|
|
|
| 206 |
num_sents = len(cleaned_text_list)
|
| 207 |
num_asps = len(ASPEK_COLUMNS)
|
| 208 |
|
| 209 |
+
# Buat dataset dan dataloader
|
| 210 |
ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
|
| 211 |
tokenizer, CONFIG["max_len"])
|
| 212 |
dl = DataLoader(
|
|
|
|
| 216 |
num_workers=0
|
| 217 |
)
|
| 218 |
|
| 219 |
+
# Matrix untuk menyimpan hasil prediksi
|
| 220 |
predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
|
| 221 |
|
| 222 |
batch_counter = 0
|
| 223 |
total_batch_count = len(dl)
|
| 224 |
|
| 225 |
+
# Lakukan prediksi batch demi batch
|
| 226 |
model.eval()
|
| 227 |
with torch.no_grad():
|
| 228 |
for batch_data in dl:
|
|
|
|
| 231 |
sent_idxs = batch_data['sent_idx'].numpy()
|
| 232 |
asp_idxs = batch_data['aspect_idx'].numpy()
|
| 233 |
|
| 234 |
+
# Forward pass model
|
| 235 |
model_outputs = model(inp_ids, attn_mask)
|
| 236 |
probabilities = F.softmax(model_outputs, dim=1)
|
| 237 |
predicted_indices = torch.argmax(
|
| 238 |
probabilities, dim=1).cpu().numpy()
|
| 239 |
pred_labels = le.inverse_transform(predicted_indices)
|
| 240 |
|
| 241 |
+
# Simpan hasil prediksi ke matrix
|
| 242 |
for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
|
| 243 |
predictions_matrix[s_idx][a_idx] = lbl
|
| 244 |
|
| 245 |
+
# Update progress bar
|
| 246 |
batch_counter += 1
|
| 247 |
progress = batch_counter / total_batch_count
|
| 248 |
progress_bar.progress(progress)
|
|
|
|
| 254 |
for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
|
| 255 |
row_dict = data_row.to_dict()
|
| 256 |
row_dict["kritik_saran"] = cleaned_text_list[idx]
|
| 257 |
+
# Tambahkan hasil prediksi untuk setiap aspek
|
| 258 |
for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
|
| 259 |
row_dict[asp_name] = predictions_matrix[idx][asp_idx]
|
| 260 |
result_list.append(row_dict)
|
| 261 |
|
| 262 |
result_dataframe = pd.DataFrame(result_list)
|
| 263 |
|
| 264 |
+
# Simpan chunk ke file CSV
|
| 265 |
chunks_directory = get_session_chunks_dir()
|
| 266 |
chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
|
| 267 |
result_dataframe.to_csv(chunk_filepath, index=False)
|
|
|
|
| 270 |
progress_bar.progress(1.0)
|
| 271 |
status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
|
| 272 |
|
| 273 |
+
# Bersihkan memory
|
| 274 |
clear_memory()
|
| 275 |
|
| 276 |
return result_dataframe
|
| 277 |
|
| 278 |
|
| 279 |
def get_available_columns(df):
|
| 280 |
+
"""
|
| 281 |
+
Deteksi kolom-kolom yang tersedia dalam dataframe
|
| 282 |
+
Untuk menentukan visualisasi mana yang bisa ditampilkan
|
| 283 |
+
"""
|
| 284 |
available = {
|
| 285 |
'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
|
| 286 |
'has_semester': 'semester' in df.columns,
|
|
|
|
| 290 |
return available
|
| 291 |
|
| 292 |
|
| 293 |
+
# ================== BAGIAN UI UTAMA ==================
|
| 294 |
+
|
| 295 |
# Judul aplikasi
|
| 296 |
st.markdown("""
|
| 297 |
<h1 class='title-center'>ABSA IndoBERT</h1>
|
|
|
|
| 303 |
st.markdown(" ")
|
| 304 |
st.markdown(" ")
|
| 305 |
|
| 306 |
+
# Panduan penggunaan aplikasi
|
| 307 |
steps = [
|
| 308 |
{"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
|
| 309 |
"description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
|
|
|
|
| 315 |
"description": "Unduh hasil analisis lengkap Anda dalam format file Excel untuk laporan lebih lanjut."}
|
| 316 |
]
|
| 317 |
|
| 318 |
+
# Tampilkan panduan dalam 4 kolom
|
| 319 |
cols = st.columns(len(steps))
|
| 320 |
|
| 321 |
for i, step in enumerate(steps):
|
|
|
|
| 331 |
st.markdown("")
|
| 332 |
st.markdown("")
|
| 333 |
|
| 334 |
+
# Upload file Excel
|
| 335 |
uploaded_file = st.file_uploader(
|
| 336 |
" Upload Data Kritik & Saran",
|
| 337 |
type=["xlsx"],
|
| 338 |
help="File maksimal 200MB dengan format .xlsx"
|
| 339 |
)
|
| 340 |
|
| 341 |
+
# Tombol untuk menghapus cache (session-specific)
|
| 342 |
session_cache_dir = get_session_cache_dir()
|
| 343 |
session_result_file = session_cache_dir / "temp_predicted.csv"
|
| 344 |
session_chunks_dir = get_session_chunks_dir()
|
| 345 |
|
| 346 |
+
# Tombol hapus cache data utama
|
| 347 |
if session_result_file.exists():
|
| 348 |
if st.button("Hapus Cache Data"):
|
| 349 |
session_result_file.unlink()
|
|
|
|
| 351 |
time.sleep(1)
|
| 352 |
st.rerun()
|
| 353 |
|
| 354 |
+
# Tombol hapus cache chunks
|
| 355 |
if session_chunks_dir.exists():
|
| 356 |
chunk_files = list(session_chunks_dir.glob("*.csv"))
|
| 357 |
if chunk_files:
|
|
|
|
| 363 |
time.sleep(1)
|
| 364 |
st.rerun()
|
| 365 |
|
| 366 |
+
# Tampilkan info file yang di-cache
|
| 367 |
if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
|
| 368 |
if not uploaded_file:
|
| 369 |
metadata_file = session_cache_dir / "metadata.txt"
|
|
|
|
| 384 |
else:
|
| 385 |
st.caption(" ")
|
| 386 |
|
| 387 |
+
# Inisialisasi session state untuk hasil prediksi
|
| 388 |
if "df_predicted" not in st.session_state:
|
| 389 |
st.session_state.df_predicted = None
|
| 390 |
|
| 391 |
+
# Load cache jika ada
|
| 392 |
if st.session_state.df_predicted is None and session_result_file.exists():
|
| 393 |
try:
|
| 394 |
df_cached = pd.read_csv(session_result_file)
|
| 395 |
+
# Konversi kolom tahun ke format yang benar
|
| 396 |
if "tahun" in df_cached.columns:
|
| 397 |
df_cached["tahun"] = pd.to_numeric(
|
| 398 |
df_cached["tahun"], errors='coerce').astype('Int64')
|
|
|
|
| 402 |
st.warning(f"Gagal memuat cache: {e}")
|
| 403 |
|
| 404 |
|
| 405 |
+
# ================== PROSES UPLOAD & PREDIKSI ==================
|
| 406 |
if uploaded_file:
|
| 407 |
file_bytes = uploaded_file.getvalue()
|
| 408 |
+
|
| 409 |
+
# Cek apakah file baru atau sama dengan sebelumnya
|
| 410 |
if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
|
| 411 |
st.session_state.last_uploaded_file = file_bytes
|
| 412 |
st.session_state.uploaded_filename = uploaded_file.name
|
| 413 |
+
|
| 414 |
try:
|
| 415 |
+
# Baca file Excel
|
| 416 |
df_uploaded = pd.read_excel(BytesIO(file_bytes))
|
| 417 |
|
| 418 |
+
# Konversi kolom tahun jika ada
|
| 419 |
if "tahun" in df_uploaded.columns:
|
| 420 |
df_uploaded["tahun"] = pd.to_numeric(
|
| 421 |
df_uploaded["tahun"], errors='coerce').astype('Int64')
|
|
|
|
| 423 |
except ValueError as err:
|
| 424 |
st.error(f"Gagal membaca file: {err}")
|
| 425 |
else:
|
| 426 |
+
# Validasi kolom wajib
|
| 427 |
if "kritik_saran" not in df_uploaded.columns:
|
| 428 |
st.error("Kolom 'kritik_saran' tidak ditemukan.")
|
| 429 |
else:
|
| 430 |
+
# Hapus duplikat berdasarkan kolom kritik_saran
|
| 431 |
df_uploaded = df_uploaded.drop_duplicates(
|
| 432 |
subset=["kritik_saran"])
|
| 433 |
+
|
| 434 |
+
# Tambahkan kolom aspek jika belum ada
|
| 435 |
for aspect_col in ASPEK_COLUMNS:
|
| 436 |
if aspect_col not in df_uploaded.columns:
|
| 437 |
df_uploaded[aspect_col] = None
|
|
|
|
| 441 |
total_rows = len(df_uploaded)
|
| 442 |
use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
|
| 443 |
|
| 444 |
+
# ============ MODE CHUNKED PROCESSING ============
|
| 445 |
if use_chunked:
|
| 446 |
num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
|
| 447 |
|
| 448 |
+
# Tampilkan info processing
|
| 449 |
info_col1, info_col2, info_col3 = st.columns(3)
|
| 450 |
with info_col1:
|
| 451 |
st.info(f"**Total data:** {total_rows:,} rows")
|
|
|
|
| 462 |
chunk_status_text = st.empty()
|
| 463 |
overall_status = st.empty()
|
| 464 |
|
| 465 |
+
# Proses setiap chunk
|
| 466 |
for start_idx in range(0, total_rows, CHUNK_SIZE):
|
| 467 |
current_chunk_number = (start_idx // CHUNK_SIZE) + 1
|
| 468 |
current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
|
|
|
|
| 471 |
current_chunk_file = session_chunks_dir / \
|
| 472 |
f"chunk_{current_chunk_number}.csv"
|
| 473 |
|
| 474 |
+
# Cek apakah chunk sudah pernah diproses (ada di cache)
|
| 475 |
if current_chunk_file.exists():
|
| 476 |
chunk_result = pd.read_csv(current_chunk_file)
|
| 477 |
all_chunk_results.append(chunk_result)
|
|
|
|
| 490 |
time.sleep(0.3)
|
| 491 |
continue
|
| 492 |
|
| 493 |
+
# Proses chunk baru
|
| 494 |
chunk_progress_bar.progress(0)
|
| 495 |
|
| 496 |
chunk_result = process_chunk_batch(
|
|
|
|
| 499 |
)
|
| 500 |
all_chunk_results.append(chunk_result)
|
| 501 |
|
| 502 |
+
# Hitung estimasi waktu
|
| 503 |
processed = min(start_idx + CHUNK_SIZE, total_rows)
|
| 504 |
progress_pct = (processed / total_rows) * 100
|
| 505 |
elapsed = time.time() - start_time
|
|
|
|
| 514 |
|
| 515 |
time.sleep(0.3)
|
| 516 |
|
| 517 |
+
# Gabungkan semua chunk
|
| 518 |
chunk_status_text.empty()
|
| 519 |
overall_status.info("🔄 Menggabungkan semua chunks...")
|
| 520 |
df_session = pd.concat(
|
|
|
|
| 524 |
end_time = time.time()
|
| 525 |
duration = end_time - start_time
|
| 526 |
|
| 527 |
+
# ============ MODE BATCH PROCESSING (tanpa chunk) ============
|
| 528 |
else:
|
| 529 |
st.info(
|
| 530 |
f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
|
|
|
|
| 534 |
progress_bar = st.progress(0)
|
| 535 |
status_text = st.empty()
|
| 536 |
|
| 537 |
+
# Preprocessing
|
| 538 |
cleaned_text_list = []
|
| 539 |
total_preprocessing = len(df_uploaded)
|
| 540 |
|
|
|
|
| 552 |
status_text.text("Memulai prediksi...")
|
| 553 |
time.sleep(0.3)
|
| 554 |
|
| 555 |
+
# Batch Prediction
|
| 556 |
batch_sz = CONFIG.get("batch_size", 32)
|
| 557 |
num_sents = len(cleaned_text_list)
|
| 558 |
num_asps = len(ASPEK_COLUMNS)
|
|
|
|
| 592 |
status_text.text(
|
| 593 |
f"Predicting: {batch_counter}/{total_batch_count} batches")
|
| 594 |
|
| 595 |
+
# Combine results
|
| 596 |
result_list = []
|
| 597 |
for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
|
| 598 |
row_dict = data_row.to_dict()
|
|
|
|
| 612 |
end_time = time.time()
|
| 613 |
duration = end_time - start_time
|
| 614 |
|
| 615 |
+
# Simpan hasil ke session state dan cache
|
| 616 |
st.session_state.df_predicted = df_session
|
| 617 |
df_session.to_csv(session_result_file, index=False)
|
| 618 |
|
| 619 |
+
# Simpan metadata file
|
| 620 |
metadata_file = session_cache_dir / "metadata.txt"
|
| 621 |
with open(metadata_file, "w", encoding="utf-8") as f:
|
| 622 |
f.write(uploaded_file.name)
|
| 623 |
|
| 624 |
+
# Hitung statistik processing
|
| 625 |
total_items = total_rows * len(ASPEK_COLUMNS)
|
| 626 |
items_per_second = total_items / duration if duration > 0 else 0
|
| 627 |
|
| 628 |
+
# Tampilkan hasil processing
|
| 629 |
if use_chunked:
|
| 630 |
st.success(
|
| 631 |
f"✅ **Chunked + Batch Processing selesai!**\n\n"
|
|
|
|
| 645 |
f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
|
| 646 |
)
|
| 647 |
|
| 648 |
+
# ================== TAMPILAN HASIL & VISUALISASI ==================
|
| 649 |
if st.session_state.df_predicted is not None:
|
| 650 |
df_predicted = st.session_state.df_predicted
|
| 651 |
|
| 652 |
+
# Deteksi kolom yang tersedia dalam dataframe
|
| 653 |
available_cols = get_available_columns(df_predicted)
|
| 654 |
|
| 655 |
+
# ============ SIDEBAR FILTER ============
|
| 656 |
st.sidebar.header("Filter Data")
|
| 657 |
|
| 658 |
df_clean = df_predicted.copy()
|
|
|
|
| 664 |
st.sidebar.info(
|
| 665 |
"Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
|
| 666 |
|
| 667 |
+
# Filter Mata Kuliah (jika ada)
|
| 668 |
selected_matkul = []
|
| 669 |
if available_cols['has_matkul']:
|
| 670 |
matkul_options = sorted(
|
|
|
|
| 673 |
selected_matkul = st.sidebar.multiselect(
|
| 674 |
"Nama Mata Kuliah", matkul_options, default=matkul_options)
|
| 675 |
|
| 676 |
+
# Filter Program Studi (jika ada)
|
| 677 |
selected_prodi = []
|
| 678 |
if available_cols['has_prodi']:
|
| 679 |
prodi_options = sorted(
|
|
|
|
| 682 |
selected_prodi = st.sidebar.multiselect(
|
| 683 |
"Program Studi", prodi_options, default=prodi_options)
|
| 684 |
|
| 685 |
+
# Filter Tahun (jika ada)
|
| 686 |
selected_tahun = []
|
| 687 |
if available_cols['has_tahun']:
|
| 688 |
+
# Konversi tanggal ke tahun jika perlu
|
| 689 |
if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
|
| 690 |
df_clean['tahun'] = pd.to_datetime(
|
| 691 |
df_clean['tanggal'], errors='coerce').dt.year
|
|
|
|
| 696 |
selected_tahun = st.sidebar.multiselect(
|
| 697 |
"Tahun", tahun_options, default=tahun_options)
|
| 698 |
|
| 699 |
+
# Filter Semester (jika ada)
|
| 700 |
selected_semester = []
|
| 701 |
if available_cols['has_semester']:
|
| 702 |
semester_options = sorted(
|
|
|
|
| 705 |
selected_semester = st.sidebar.multiselect(
|
| 706 |
"Semester", semester_options, default=semester_options)
|
| 707 |
|
| 708 |
+
# Apply semua filter yang dipilih
|
| 709 |
df_filtered = df_clean.copy()
|
| 710 |
|
| 711 |
if selected_matkul and available_cols['has_matkul']:
|
|
|
|
| 714 |
|
| 715 |
if selected_prodi and available_cols['has_prodi']:
|
| 716 |
df_filtered = df_filtered[df_filtered["nama_prodi"].isin(
|
| 717 |
+
selected_prodi
|
| 718 |
+
if selected_prodi and available_cols['has_prodi']:
|
| 719 |
+
df_filtered=df_filtered[df_filtered["nama_prodi"].isin(
|
| 720 |
+
selected_prodi)]
|
| 721 |
+
|
| 722 |
+
if selected_tahun and available_cols['has_tahun']:
|
| 723 |
+
df_filtered=df_filtered[df_filtered["tahun"].isin(selected_tahun)]
|
| 724 |
+
|
| 725 |
+
if selected_semester and available_cols['has_semester']:
|
| 726 |
+
df_filtered=df_filtered[df_filtered["semester"].isin(
|
| 727 |
+
selected_semester)]
|
| 728 |
+
|
| 729 |
+
# ============ TAMPILAN TABEL HASIL ============
|
| 730 |
+
st.markdown("### Tabel Data Hasil Prediksi")
|
| 731 |
+
st.dataframe(df_filtered, width='stretch')
|
| 732 |
+
|
| 733 |
+
# ============ TOMBOL DOWNLOAD ============
|
| 734 |
+
col_dl1, col_dl2=st.columns(2)
|
| 735 |
+
with col_dl1:
|
| 736 |
+
# Download data terfilter
|
| 737 |
+
st.download_button(
|
| 738 |
+
label="Unduh Data Terfilter",
|
| 739 |
+
data=convert_df_to_excel(df_filtered),
|
| 740 |
+
file_name="hasil_prediksi_absa_filtered.xlsx",
|
| 741 |
+
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 742 |
+
use_container_width=True
|
| 743 |
+
)
|
| 744 |
+
|
| 745 |
+
with col_dl2:
|
| 746 |
+
# Download semua data tanpa filter
|
| 747 |
+
st.download_button(
|
| 748 |
+
label="Unduh Semua Data",
|
| 749 |
+
data=convert_df_to_excel(df_predicted),
|
| 750 |
+
file_name="hasil_prediksi_absa_all.xlsx",
|
| 751 |
+
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
| 752 |
+
use_container_width=True
|
| 753 |
+
)
|
| 754 |
+
|
| 755 |
+
st.info(
|
| 756 |
+
f"Menampilkan {len(df_filtered):,} dari {len(df_predicted):,} data ulasan setelah difilter."
|
| 757 |
+
)
|
| 758 |
+
|
| 759 |
+
# ============ RINGKASAN CEPAT ============
|
| 760 |
+
st.markdown("")
|
| 761 |
+
st.markdown("### Ringkasan Cepat")
|
| 762 |
+
st.markdown("")
|
| 763 |
+
|
| 764 |
+
# Hitung total sentimen dari semua aspek
|
| 765 |
+
total_pos=(df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
|
| 766 |
+
total_net=(df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
|
| 767 |
+
total_neg=(df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
|
| 768 |
+
|
| 769 |
+
# Tentukan kolom mana yang tersedia untuk ditampilkan
|
| 770 |
+
summary_cols=[]
|
| 771 |
+
|
| 772 |
+
# Kolom dasar (selalu ada)
|
| 773 |
+
summary_cols.extend(['ulasan', 'aspek'])
|
| 774 |
+
|
| 775 |
+
# Kolom opsional berdasarkan data yang tersedia
|
| 776 |
+
if available_cols['has_matkul']:
|
| 777 |
+
summary_cols.append('matkul')
|
| 778 |
+
if available_cols['has_prodi']:
|
| 779 |
+
summary_cols.append('prodi')
|
| 780 |
+
if available_cols['has_semester']:
|
| 781 |
+
summary_cols.append('semester')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
|
| 783 |
+
# Buat kolom dinamis berdasarkan jumlah metrik
|
| 784 |
+
num_cols=len(summary_cols)
|
| 785 |
+
cols=st.columns(num_cols)
|
|
|
|
|
|
|
| 786 |
|
| 787 |
+
col_idx=0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 788 |
|
| 789 |
+
# Metrik: Ulasan & Aspek (selalu ada)
|
| 790 |
+
cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
|
| 791 |
+
col_idx += 1
|
| 792 |
+
cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
|
| 793 |
+
col_idx += 1
|
| 794 |
|
| 795 |
+
# Metrik: Mata Kuliah (jika ada)
|
| 796 |
+
if available_cols['has_matkul']:
|
| 797 |
+
matkul_count=df_filtered['nama_matakuliah'].nunique()
|
| 798 |
+
cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
|
| 799 |
+
col_idx += 1
|
| 800 |
|
| 801 |
+
# Metrik: Prodi (jika ada)
|
| 802 |
+
if available_cols['has_prodi']:
|
| 803 |
+
prodi_count=df_filtered['nama_prodi'].nunique()
|
| 804 |
+
cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
|
| 805 |
+
col_idx += 1
|
| 806 |
|
| 807 |
+
# Metrik: Semester (jika ada)
|
| 808 |
+
if available_cols['has_semester']:
|
| 809 |
+
semester_count=df_filtered['semester'].nunique()
|
| 810 |
+
cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
|
| 811 |
+
col_idx += 1
|
| 812 |
|
| 813 |
+
st.markdown("")
|
| 814 |
|
| 815 |
+
# Baris kedua: Sentimen + info tambahan
|
| 816 |
+
summary_cols2=['positif', 'netral', 'negatif']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
|
| 818 |
+
if available_cols['has_tahun']:
|
| 819 |
+
summary_cols2.append('tahun')
|
| 820 |
+
if 'kritik_saran' in df_filtered.columns:
|
| 821 |
+
summary_cols2.append('kata')
|
| 822 |
+
|
| 823 |
+
cols2=st.columns(len(summary_cols2))
|
| 824 |
+
|
| 825 |
+
col_idx2=0
|
| 826 |
+
# Metrik: Sentimen Positif, Netral, Negatif
|
| 827 |
+
cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
|
| 828 |
+
col_idx2 += 1
|
| 829 |
+
cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
|
| 830 |
+
col_idx2 += 1
|
| 831 |
+
cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
|
| 832 |
+
col_idx2 += 1
|
| 833 |
+
|
| 834 |
+
# Metrik: Rentang tahun (jika ada)
|
| 835 |
+
if available_cols['has_tahun']:
|
| 836 |
+
if 'tahun' in df_filtered.columns:
|
| 837 |
+
tahun_valid=df_filtered['tahun'].dropna()
|
| 838 |
if len(tahun_valid) > 0:
|
| 839 |
+
tahun_min=int(tahun_valid.min())
|
| 840 |
+
tahun_max=int(tahun_valid.max())
|
| 841 |
if tahun_min == tahun_max:
|
| 842 |
cols2[col_idx2].metric("Tahun", f"{tahun_min}")
|
| 843 |
else:
|
|
|
|
| 845 |
"Rentang Tahun", f"{tahun_min} - {tahun_max}")
|
| 846 |
else:
|
| 847 |
cols2[col_idx2].metric("Rentang Tahun", "N/A")
|
| 848 |
+
else:
|
| 849 |
cols2[col_idx2].metric("Rentang Tahun", "N/A")
|
| 850 |
+
col_idx2 += 1
|
| 851 |
|
| 852 |
+
# Metrik: Rata-rata panjang kata (jika kolom kritik_saran ada)
|
| 853 |
+
if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
|
| 854 |
+
try:
|
| 855 |
+
word_counts=df_filtered['kritik_saran'].astype(
|
| 856 |
str).str.split().str.len()
|
| 857 |
+
avg_word_count=round(word_counts.mean(), 1)
|
| 858 |
cols2[col_idx2].metric(
|
| 859 |
"Rata-rata Panjang Kata", f"{avg_word_count} kata")
|
| 860 |
+
except Exception:
|
| 861 |
cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
|
| 862 |
|
| 863 |
+
# ============ VISUALISASI DATA ============
|
| 864 |
+
st.markdown("---")
|
| 865 |
+
st.markdown("### Visualisasi Data")
|
| 866 |
|
| 867 |
+
# Visualisasi Sentimen Dasar (selalu ditampilkan)
|
| 868 |
+
col1, col2=st.columns(2)
|
| 869 |
+
with col1:
|
| 870 |
+
show_sentiment_bar_chart(df_filtered, ASPEK_COLUMNS)
|
| 871 |
+
with col2:
|
| 872 |
+
show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
|
| 873 |
|
| 874 |
+
# Visualisasi berdasarkan kolom yang tersedia
|
| 875 |
+
viz_shown=False
|
| 876 |
|
| 877 |
+
# Visualisasi: Distribusi Tahun & Semester
|
| 878 |
+
if available_cols['has_tahun'] or available_cols['has_semester']:
|
| 879 |
+
col1, col2=st.columns(2)
|
| 880 |
+
with col1:
|
| 881 |
if available_cols['has_tahun']:
|
| 882 |
+
result=show_year_distribution(df_filtered)
|
| 883 |
if result:
|
| 884 |
+
viz_shown=True
|
| 885 |
+
with col2:
|
| 886 |
if available_cols['has_semester']:
|
| 887 |
+
result=show_semester_distribution(df_filtered)
|
| 888 |
if result:
|
| 889 |
+
viz_shown=True
|
| 890 |
+
|
| 891 |
+
# Visualisasi: Distribusi Prodi
|
| 892 |
+
if available_cols['has_prodi']:
|
| 893 |
+
st.markdown("---")
|
| 894 |
+
result=show_prodi_distribution(df_filtered)
|
| 895 |
+
if result:
|
| 896 |
+
viz_shown=True
|
| 897 |
+
|
| 898 |
+
# Visualisasi: Distribusi Top 10 Mata Kuliah
|
| 899 |
+
if available_cols['has_matkul']:
|
| 900 |
+
st.markdown("---")
|
| 901 |
+
result=show_top10_matkul_distribution(df_filtered)
|
| 902 |
+
if result:
|
| 903 |
+
viz_shown=True
|
| 904 |
+
|
| 905 |
+
# Visualisasi: Sentimen per Tahun/Semester
|
| 906 |
+
if available_cols['has_tahun'] or available_cols['has_semester']:
|
| 907 |
+
st.markdown("---")
|
| 908 |
+
col1, col2=st.columns(2)
|
| 909 |
+
with col1:
|
| 910 |
if available_cols['has_tahun']:
|
| 911 |
+
result=show_sentiment_by_year(df_filtered, ASPEK_COLUMNS)
|
| 912 |
if result:
|
| 913 |
+
viz_shown=True
|
| 914 |
+
with col2:
|
| 915 |
if available_cols['has_semester']:
|
| 916 |
+
result=show_sentiment_by_semester(df_filtered, ASPEK_COLUMNS)
|
| 917 |
if result:
|
| 918 |
+
viz_shown=True
|
| 919 |
+
|
| 920 |
+
# Visualisasi: Sentimen per Prodi
|
| 921 |
+
if available_cols['has_prodi']:
|
| 922 |
+
st.markdown("---")
|
| 923 |
+
result=show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
|
| 924 |
+
if result:
|
| 925 |
+
viz_shown=True
|
| 926 |
+
|
| 927 |
+
# Visualisasi: Sentimen per Top 10 Mata Kuliah
|
| 928 |
+
if available_cols['has_matkul']:
|
| 929 |
+
st.markdown("---")
|
| 930 |
+
result=show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
|
| 931 |
+
if result:
|
| 932 |
+
viz_shown=True
|
| 933 |
+
|
| 934 |
+
# ============ FOOTER ============
|
| 935 |
+
st.caption("""
|
| 936 |
<div class='footer'>
|
| 937 |
+
© 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
|
| 938 |
</div>
|
| 939 |
""", unsafe_allow_html=True)
|
config.py
CHANGED
|
@@ -12,7 +12,6 @@ CONFIG = {
|
|
| 12 |
"hf_model_repo": "zdannn2808/absa_indobert",
|
| 13 |
"hf_model_subfolder": "Indobert_Model/model",
|
| 14 |
"hf_tokenizer_subfolder": "Indobert_Model/tokenizer",
|
| 15 |
-
# ✅ TAMBAHAN: Konfigurasi untuk batch processing
|
| 16 |
"batch_size": 32, # Ukuran batch untuk prediksi
|
| 17 |
"num_workers": 0, # Jumlah worker untuk DataLoader (0 = main process only)
|
| 18 |
}
|
|
|
|
| 12 |
"hf_model_repo": "zdannn2808/absa_indobert",
|
| 13 |
"hf_model_subfolder": "Indobert_Model/model",
|
| 14 |
"hf_tokenizer_subfolder": "Indobert_Model/tokenizer",
|
|
|
|
| 15 |
"batch_size": 32, # Ukuran batch untuk prediksi
|
| 16 |
"num_workers": 0, # Jumlah worker untuk DataLoader (0 = main process only)
|
| 17 |
}
|
preprocessing.py
CHANGED
|
@@ -7,9 +7,11 @@ Modul untuk preprocessing teks sebelum prediksi ABSA
|
|
| 7 |
import re
|
| 8 |
import string
|
| 9 |
|
| 10 |
-
#
|
|
|
|
| 11 |
try:
|
| 12 |
import nltk
|
|
|
|
| 13 |
try:
|
| 14 |
nltk.data.find('tokenizers/punkt')
|
| 15 |
except LookupError:
|
|
@@ -17,16 +19,18 @@ try:
|
|
| 17 |
nltk.download('punkt', quiet=True)
|
| 18 |
print("✅ NLTK punkt downloaded")
|
| 19 |
|
| 20 |
-
#
|
| 21 |
try:
|
| 22 |
nltk.data.find('tokenizers/punkt_tab')
|
| 23 |
except LookupError:
|
| 24 |
nltk.download('punkt_tab', quiet=True)
|
| 25 |
except ImportError:
|
|
|
|
| 26 |
print("⚠️ NLTK tidak terinstall, menggunakan tokenizer sederhana")
|
| 27 |
|
| 28 |
|
| 29 |
-
#
|
|
|
|
| 30 |
INDONESIAN_STOPWORDS = set([
|
| 31 |
'ada', 'adalah', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 'akankah', 'akhir',
|
| 32 |
'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 'anda', 'andalah', 'antar', 'antara',
|
|
@@ -124,20 +128,23 @@ INDONESIAN_STOPWORDS = set([
|
|
| 124 |
|
| 125 |
def simple_tokenize(text):
|
| 126 |
"""
|
| 127 |
-
Tokenizer sederhana dengan split by whitespace
|
|
|
|
| 128 |
|
| 129 |
Args:
|
| 130 |
text (str): Teks input
|
| 131 |
|
| 132 |
Returns:
|
| 133 |
-
list: List of tokens
|
| 134 |
"""
|
|
|
|
| 135 |
return text.split()
|
| 136 |
|
| 137 |
|
| 138 |
def remove_emoji(text):
|
| 139 |
"""
|
| 140 |
-
Menghapus emoji dari teks
|
|
|
|
| 141 |
|
| 142 |
Args:
|
| 143 |
text (str): Teks input
|
|
@@ -145,23 +152,26 @@ def remove_emoji(text):
|
|
| 145 |
Returns:
|
| 146 |
str: Teks tanpa emoji
|
| 147 |
"""
|
|
|
|
| 148 |
emoji_pattern = re.compile(
|
| 149 |
"["
|
| 150 |
-
"\U0001F600-\U0001F64F" #
|
| 151 |
-
"\U0001F300-\U0001F5FF" #
|
| 152 |
-
"\U0001F680-\U0001F6FF" #
|
| 153 |
-
"\U0001F1E0-\U0001F1FF" #
|
| 154 |
-
"\U00002702-\U000027B0"
|
| 155 |
-
"\U000024C2-\U0001F251"
|
| 156 |
"]+",
|
| 157 |
flags=re.UNICODE,
|
| 158 |
)
|
|
|
|
| 159 |
return emoji_pattern.sub(r"", text)
|
| 160 |
|
| 161 |
|
| 162 |
def cleaning_text(text):
|
| 163 |
"""
|
| 164 |
-
Membersihkan teks dari
|
|
|
|
| 165 |
|
| 166 |
Args:
|
| 167 |
text (str): Teks input
|
|
@@ -169,36 +179,38 @@ def cleaning_text(text):
|
|
| 169 |
Returns:
|
| 170 |
str: Teks yang sudah dibersihkan
|
| 171 |
"""
|
|
|
|
| 172 |
if not isinstance(text, str):
|
| 173 |
text = str(text)
|
| 174 |
-
|
| 175 |
-
#
|
| 176 |
text = text.lower()
|
| 177 |
-
|
| 178 |
-
#
|
| 179 |
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
|
| 180 |
-
|
| 181 |
-
#
|
| 182 |
text = re.sub(r'@\w+|#\w+', '', text)
|
| 183 |
-
|
| 184 |
-
#
|
| 185 |
text = re.sub(r'\S+@\S+', '', text)
|
| 186 |
-
|
| 187 |
-
#
|
| 188 |
text = re.sub(r'\d+', '', text)
|
| 189 |
-
|
| 190 |
-
#
|
| 191 |
text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
|
| 192 |
-
|
| 193 |
-
#
|
| 194 |
text = ' '.join(text.split())
|
| 195 |
-
|
| 196 |
return text
|
| 197 |
|
| 198 |
|
| 199 |
def normalize_text(tokens):
|
| 200 |
"""
|
| 201 |
-
Normalisasi token: hapus punctuation dan
|
|
|
|
| 202 |
|
| 203 |
Args:
|
| 204 |
tokens (list): List of tokens
|
|
@@ -206,42 +218,45 @@ def normalize_text(tokens):
|
|
| 206 |
Returns:
|
| 207 |
list: List of normalized tokens
|
| 208 |
"""
|
| 209 |
-
#
|
| 210 |
-
tokens = [token.translate(str.maketrans(
|
| 211 |
-
|
| 212 |
-
|
|
|
|
| 213 |
tokens = [token for token in tokens if token]
|
| 214 |
-
|
| 215 |
-
#
|
| 216 |
tokens = [token for token in tokens if token not in INDONESIAN_STOPWORDS]
|
| 217 |
-
|
| 218 |
-
#
|
| 219 |
tokens = [token for token in tokens if len(token) > 1]
|
| 220 |
-
|
| 221 |
return tokens
|
| 222 |
|
| 223 |
|
| 224 |
def text_preprocessing_pipeline(text):
|
| 225 |
"""
|
| 226 |
-
Pipeline preprocessing lengkap
|
|
|
|
|
|
|
| 227 |
|
| 228 |
Args:
|
| 229 |
-
text (str): Teks input
|
| 230 |
|
| 231 |
Returns:
|
| 232 |
-
str: Teks yang sudah dipreprocess
|
| 233 |
"""
|
| 234 |
-
# Cleaning
|
| 235 |
text = cleaning_text(text)
|
| 236 |
-
|
| 237 |
-
# Remove emoji
|
| 238 |
text = remove_emoji(text)
|
| 239 |
-
|
| 240 |
-
# Tokenize (
|
| 241 |
tokens = simple_tokenize(text)
|
| 242 |
-
|
| 243 |
-
# Normalize (
|
| 244 |
tokens = normalize_text(tokens)
|
| 245 |
-
|
| 246 |
-
# Join
|
| 247 |
-
return " ".join(tokens)
|
|
|
|
| 7 |
import re
|
| 8 |
import string
|
| 9 |
|
| 10 |
+
# === SETUP NLTK (OPTIONAL) ===
|
| 11 |
+
# Download NLTK data jika diperlukan untuk tokenisasi advanced
|
| 12 |
try:
|
| 13 |
import nltk
|
| 14 |
+
# Cek apakah punkt tokenizer sudah terinstall
|
| 15 |
try:
|
| 16 |
nltk.data.find('tokenizers/punkt')
|
| 17 |
except LookupError:
|
|
|
|
| 19 |
nltk.download('punkt', quiet=True)
|
| 20 |
print("✅ NLTK punkt downloaded")
|
| 21 |
|
| 22 |
+
# Cek punkt_tab untuk versi NLTK terbaru
|
| 23 |
try:
|
| 24 |
nltk.data.find('tokenizers/punkt_tab')
|
| 25 |
except LookupError:
|
| 26 |
nltk.download('punkt_tab', quiet=True)
|
| 27 |
except ImportError:
|
| 28 |
+
# Jika NLTK tidak terinstall, gunakan tokenizer sederhana
|
| 29 |
print("⚠️ NLTK tidak terinstall, menggunakan tokenizer sederhana")
|
| 30 |
|
| 31 |
|
| 32 |
+
# === STOPWORDS BAHASA INDONESIA ===
|
| 33 |
+
# Daftar kata yang tidak memiliki makna signifikan dan dapat dihapus
|
| 34 |
INDONESIAN_STOPWORDS = set([
|
| 35 |
'ada', 'adalah', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 'akankah', 'akhir',
|
| 36 |
'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 'anda', 'andalah', 'antar', 'antara',
|
|
|
|
| 128 |
|
| 129 |
def simple_tokenize(text):
    """Split text into tokens on runs of whitespace.

    Lightweight tokenizer intended as the alternative when NLTK is not
    available.

    Args:
        text (str): Input text.

    Returns:
        list: The whitespace-separated words; an empty list when the input
            is empty or contains only whitespace.
    """
    # split() with no separator collapses consecutive whitespace and ignores
    # leading/trailing whitespace, so no empty tokens are produced.
    words = text.split()
    return words
|
| 142 |
|
| 143 |
|
| 144 |
# Compiled once at import time instead of on every remove_emoji() call.
# NOTE: the U+24C2-U+1F251 range is very wide and also covers non-emoji
# characters (e.g. CJK ideographs). It is kept as-is so that everything the
# previous pattern removed is still removed.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # Regional indicators (flags)
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"  # Enclosed characters (wide range, see NOTE)
    "\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF"  # Chess symbols, symbols & pictographs ext-A
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Remove emoji characters from the text.

    Every run of characters matching the emoji ranges is deleted. Whitespace
    surrounding a removed emoji is left untouched.

    Args:
        text (str): Input text.

    Returns:
        str: The text without emoji.
    """
    return _EMOJI_PATTERN.sub("", text)
|
| 169 |
|
| 170 |
|
| 171 |
def cleaning_text(text):
    """Clean raw text of elements that are not needed for analysis.

    Steps, in order: lowercase, remove URLs, remove e-mail addresses, remove
    mentions and hashtags, remove digits, collapse whitespace.

    Args:
        text (str): Input text. Non-string values are converted with str().

    Returns:
        str: The cleaned text, lowercased, with single spaces between words
            and no leading/trailing whitespace.
    """
    # Make sure we are working with a string.
    if not isinstance(text, str):
        text = str(text)

    # Lowercase for consistency.
    text = text.lower()

    # Remove URLs. 'http\S+' already covers 'https...' links.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove e-mail addresses. This must run BEFORE mention removal:
    # otherwise '@\w+' eats the '@domain' part of 'user@domain.com' and
    # leaves 'user.com' behind, which the e-mail pattern can no longer match.
    text = re.sub(r'\S+@\S+', '', text)

    # Remove mentions (@username) and hashtags (#topic).
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Collapse every whitespace run (spaces, newlines, carriage returns,
    # tabs) into a single space and trim both ends.
    return ' '.join(text.split())
|
| 208 |
|
| 209 |
|
| 210 |
# Translation table that deletes every ASCII punctuation character.
# Built once at import time rather than once per token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def normalize_text(tokens):
    """Normalize tokens by stripping punctuation and filtering noise.

    Each token has its ASCII punctuation removed; the token is then dropped
    if it is empty, a single character, or listed in INDONESIAN_STOPWORDS.

    Args:
        tokens (list): List of tokens.

    Returns:
        list: The normalized tokens, in their original order.
    """
    stripped_tokens = (token.translate(_PUNCTUATION_TABLE) for token in tokens)
    # len(...) > 1 rejects both empty strings (tokens that were pure
    # punctuation) and single-character tokens.
    return [
        token
        for token in stripped_tokens
        if len(token) > 1 and token not in INDONESIAN_STOPWORDS
    ]
|
| 235 |
|
| 236 |
|
| 237 |
def text_preprocessing_pipeline(text):
    """Run the full preprocessing pipeline on a text before ABSA prediction.

    Chains the preprocessing steps: cleaning -> emoji removal ->
    tokenization -> normalization, then joins the tokens back together.

    Args:
        text (str): Raw input text.

    Returns:
        str: The preprocessed text, with the remaining tokens separated by
            single spaces.
    """
    # Steps 1-2: clean the raw text, then strip emoji from the result.
    cleaned = remove_emoji(cleaning_text(text))

    # Steps 3-4: split into words, then normalize/filter them.
    kept_tokens = normalize_text(simple_tokenize(cleaned))

    # Step 5: re-assemble the surviving tokens into one string.
    return " ".join(kept_tokens)
|
visualization.py
CHANGED
|
@@ -13,135 +13,241 @@ import plotly.express as px
|
|
| 13 |
from config import ASPEK_COLUMNS
|
| 14 |
|
| 15 |
|
| 16 |
-
# Palet warna kustom
|
| 17 |
sentimen_palette = {
|
| 18 |
-
"netral": "#FFE24C",
|
| 19 |
-
"positif": "#4CFF72",
|
| 20 |
-
"negatif": "#FF4C4C"
|
| 21 |
}
|
|
|
|
|
|
|
| 22 |
category_order = ["netral", "positif", "negatif"]
|
| 23 |
|
| 24 |
-
# Konfigurasi Plotly
|
| 25 |
config_options = {
|
| 26 |
-
"scrollZoom": False,
|
| 27 |
-
"displayModeBar": False
|
| 28 |
}
|
| 29 |
|
| 30 |
|
| 31 |
def show_sentiment_bar_chart(df_predicted, aspek_columns):
|
| 32 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
|
| 34 |
st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
|
| 35 |
return
|
| 36 |
|
|
|
|
| 37 |
df_long = df_predicted.melt(
|
| 38 |
value_vars=aspek_columns,
|
| 39 |
var_name="aspek",
|
| 40 |
value_name="sentimen"
|
| 41 |
)
|
|
|
|
|
|
|
| 42 |
df_long["sentimen"] = pd.Categorical(
|
| 43 |
df_long["sentimen"],
|
| 44 |
categories=category_order,
|
| 45 |
ordered=True
|
| 46 |
)
|
|
|
|
|
|
|
| 47 |
count_data = df_long.groupby(
|
| 48 |
["aspek", "sentimen"], observed=False
|
| 49 |
).size().reset_index(name="jumlah")
|
|
|
|
|
|
|
| 50 |
fig = px.bar(
|
| 51 |
count_data,
|
| 52 |
x="aspek",
|
| 53 |
y="jumlah",
|
| 54 |
color="sentimen",
|
| 55 |
-
barmode="group",
|
| 56 |
color_discrete_map=sentimen_palette,
|
| 57 |
category_orders={"sentimen": category_order}
|
| 58 |
)
|
| 59 |
fig.update_layout(title="Distribusi Sentimen per Aspek")
|
|
|
|
|
|
|
| 60 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 61 |
|
| 62 |
|
| 63 |
def show_sentiment_pie_chart(df_predicted, aspek_columns):
|
| 64 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
sentimen_total = df_predicted[aspek_columns].values.ravel()
|
|
|
|
|
|
|
| 66 |
sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
|
| 67 |
sentimen_counts.columns = ["sentimen", "jumlah"]
|
| 68 |
sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
fig.update_layout(title="Total Komposisi Sentimen")
|
|
|
|
|
|
|
| 73 |
fig.update_traces(textposition='inside', textinfo='percent+label')
|
|
|
|
| 74 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 75 |
|
| 76 |
|
| 77 |
def show_year_distribution(df):
|
| 78 |
-
"""
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
if 'tanggal' in df.columns and 'tahun' not in df.columns:
|
| 81 |
df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
|
| 82 |
|
|
|
|
| 83 |
if 'tahun' not in df.columns:
|
| 84 |
-
return None
|
| 85 |
|
|
|
|
| 86 |
df_tahun = df.dropna(subset=['tahun']).copy()
|
| 87 |
if df_tahun.empty:
|
| 88 |
return None
|
| 89 |
|
|
|
|
| 90 |
df_tahun['tahun'] = df_tahun['tahun'].astype(int)
|
|
|
|
|
|
|
| 91 |
year_counts = df_tahun['tahun'].value_counts().reset_index()
|
| 92 |
year_counts.columns = ['tahun', 'jumlah']
|
| 93 |
year_counts = year_counts.sort_values('jumlah', ascending=False)
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
fig.update_layout(xaxis=dict(type='category'))
|
|
|
|
| 98 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 99 |
return True
|
| 100 |
|
| 101 |
|
| 102 |
def show_semester_distribution(df):
|
| 103 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
if 'semester' not in df.columns:
|
| 105 |
return None
|
| 106 |
|
|
|
|
| 107 |
semester_counts = df['semester'].value_counts().reset_index()
|
| 108 |
semester_counts.columns = ['semester', 'jumlah']
|
| 109 |
semester_counts = semester_counts.sort_values('jumlah', ascending=False)
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
fig.update_layout(xaxis=dict(categoryorder='total descending'))
|
|
|
|
| 113 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 114 |
return True
|
| 115 |
|
| 116 |
|
| 117 |
def show_prodi_distribution(df):
|
| 118 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
if 'nama_prodi' not in df.columns:
|
| 120 |
return None
|
| 121 |
|
|
|
|
| 122 |
prodi_counts = df['nama_prodi'].value_counts().reset_index()
|
| 123 |
prodi_counts.columns = ['nama_prodi', 'jumlah']
|
|
|
|
|
|
|
| 124 |
prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
|
|
|
|
|
|
|
| 125 |
fig = px.bar(
|
| 126 |
prodi_counts,
|
| 127 |
x='jumlah',
|
| 128 |
y='nama_prodi',
|
| 129 |
-
orientation='h',
|
| 130 |
color='jumlah',
|
| 131 |
title="Jumlah Kritik/Saran per Program Studi"
|
| 132 |
)
|
|
|
|
| 133 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 134 |
return True
|
| 135 |
|
| 136 |
|
| 137 |
def show_top10_matkul_distribution(df):
|
| 138 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
required_cols = ['nama_matakuliah', 'kode_matakuliah']
|
| 140 |
missing_cols = [col for col in required_cols if col not in df.columns]
|
| 141 |
|
| 142 |
if missing_cols:
|
| 143 |
return None
|
| 144 |
|
|
|
|
| 145 |
matkul_counts = (
|
| 146 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 147 |
.size()
|
|
@@ -149,12 +255,17 @@ def show_top10_matkul_distribution(df):
|
|
| 149 |
.sort_values(by='jumlah', ascending=False)
|
| 150 |
.head(10)
|
| 151 |
)
|
|
|
|
|
|
|
| 152 |
matkul_counts['label'] = (
|
| 153 |
matkul_counts['kode_matakuliah'] + " - " +
|
| 154 |
matkul_counts['nama_matakuliah']
|
| 155 |
)
|
|
|
|
|
|
|
| 156 |
matkul_counts = matkul_counts.sort_values(by='jumlah', ascending=True)
|
| 157 |
|
|
|
|
| 158 |
fig = px.bar(
|
| 159 |
matkul_counts,
|
| 160 |
x='jumlah',
|
|
@@ -163,60 +274,124 @@ def show_top10_matkul_distribution(df):
|
|
| 163 |
title="Top 10 Mata Kuliah Berdasarkan Kritik/Saran",
|
| 164 |
color='jumlah'
|
| 165 |
)
|
|
|
|
| 166 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 167 |
return True
|
| 168 |
|
| 169 |
|
| 170 |
def show_sentiment_by_year(df, aspek_columns):
|
| 171 |
-
"""
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
if 'tanggal' in df.columns and 'tahun' not in df.columns:
|
| 174 |
df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
|
| 175 |
|
|
|
|
| 176 |
if 'tahun' not in df.columns:
|
| 177 |
return None
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
year_sentiment = df_long.groupby(
|
| 184 |
['tahun', 'sentimen'], observed=False
|
| 185 |
).size().reset_index(name='jumlah')
|
|
|
|
| 186 |
year_sentiment = year_sentiment.sort_values('jumlah', ascending=False)
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Tahun")
|
|
|
|
| 190 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 191 |
return True
|
| 192 |
|
| 193 |
|
| 194 |
def show_sentiment_by_semester(df, aspek_columns):
|
| 195 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
if 'semester' not in df.columns:
|
| 197 |
return None
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
semester_sentiment = df_long.groupby(
|
| 204 |
['semester', 'sentimen'], observed=False
|
| 205 |
).size().reset_index(name='jumlah')
|
|
|
|
| 206 |
semester_sentiment = semester_sentiment.sort_values(
|
| 207 |
'jumlah', ascending=False)
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Semester")
|
|
|
|
| 211 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 212 |
return True
|
| 213 |
|
| 214 |
|
| 215 |
def show_sentiment_by_prodi(df, aspek_columns):
|
| 216 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
if 'nama_prodi' not in df.columns:
|
| 218 |
return None
|
| 219 |
|
|
|
|
| 220 |
df_long = df.melt(
|
| 221 |
id_vars=['nama_prodi'],
|
| 222 |
value_vars=aspek_columns,
|
|
@@ -224,51 +399,72 @@ def show_sentiment_by_prodi(df, aspek_columns):
|
|
| 224 |
value_name='sentimen'
|
| 225 |
)
|
| 226 |
|
|
|
|
| 227 |
prodi_sentiment = (
|
| 228 |
df_long.groupby(['nama_prodi', 'sentimen'], observed=False)
|
| 229 |
.size()
|
| 230 |
.reset_index(name='jumlah')
|
| 231 |
)
|
| 232 |
|
|
|
|
| 233 |
total_per_prodi = (
|
| 234 |
prodi_sentiment.groupby('nama_prodi')['jumlah']
|
| 235 |
.sum()
|
| 236 |
.sort_values(ascending=False)
|
| 237 |
)
|
|
|
|
|
|
|
| 238 |
ordered_categories = total_per_prodi.index.tolist()[::-1]
|
| 239 |
|
|
|
|
| 240 |
prodi_sentiment['nama_prodi'] = pd.Categorical(
|
| 241 |
prodi_sentiment['nama_prodi'],
|
| 242 |
categories=ordered_categories,
|
| 243 |
ordered=True
|
| 244 |
)
|
| 245 |
|
|
|
|
| 246 |
fig = px.bar(
|
| 247 |
prodi_sentiment,
|
| 248 |
y='nama_prodi',
|
| 249 |
x='jumlah',
|
| 250 |
color='sentimen',
|
| 251 |
barmode='group',
|
| 252 |
-
orientation='h',
|
| 253 |
color_discrete_map=sentimen_palette
|
| 254 |
)
|
| 255 |
fig.update_layout(
|
| 256 |
title="Distribusi Sentimen per Program Studi",
|
| 257 |
-
yaxis={
|
| 258 |
-
|
|
|
|
|
|
|
| 259 |
)
|
|
|
|
| 260 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 261 |
return True
|
| 262 |
|
| 263 |
|
| 264 |
def show_sentiment_by_top10_matkul(df, aspek_columns):
|
| 265 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
required_cols = ['kode_matakuliah', 'nama_matakuliah']
|
| 267 |
missing_cols = [col for col in required_cols if col not in df.columns]
|
| 268 |
|
| 269 |
if missing_cols:
|
| 270 |
return None
|
| 271 |
|
|
|
|
| 272 |
df_top10 = (
|
| 273 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 274 |
.size()
|
|
@@ -277,9 +473,11 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
|
|
| 277 |
.index
|
| 278 |
)
|
| 279 |
|
|
|
|
| 280 |
df_filtered = df[df.set_index(
|
| 281 |
['kode_matakuliah', 'nama_matakuliah']).index.isin(df_top10)]
|
| 282 |
|
|
|
|
| 283 |
df_long = df_filtered.melt(
|
| 284 |
id_vars=['kode_matakuliah', 'nama_matakuliah'],
|
| 285 |
value_vars=aspek_columns,
|
|
@@ -287,29 +485,36 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
|
|
| 287 |
value_name='sentimen'
|
| 288 |
)
|
| 289 |
|
|
|
|
| 290 |
df_long['label'] = (
|
| 291 |
df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
|
| 292 |
)
|
| 293 |
|
|
|
|
| 294 |
matkul_sentiment = (
|
| 295 |
df_long.groupby(['label', 'sentimen'], observed=False)
|
| 296 |
.size()
|
| 297 |
.reset_index(name='jumlah')
|
| 298 |
)
|
| 299 |
|
|
|
|
| 300 |
total_per_label = (
|
| 301 |
matkul_sentiment.groupby('label')['jumlah']
|
| 302 |
.sum()
|
| 303 |
.sort_values(ascending=False)
|
| 304 |
)
|
|
|
|
|
|
|
| 305 |
ordered_labels = total_per_label.index.tolist()[::-1]
|
| 306 |
|
|
|
|
| 307 |
matkul_sentiment['label'] = pd.Categorical(
|
| 308 |
matkul_sentiment['label'],
|
| 309 |
categories=ordered_labels,
|
| 310 |
ordered=True
|
| 311 |
)
|
| 312 |
|
|
|
|
| 313 |
fig = px.bar(
|
| 314 |
matkul_sentiment,
|
| 315 |
y='label',
|
|
@@ -321,48 +526,11 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
|
|
| 321 |
)
|
| 322 |
fig.update_layout(
|
| 323 |
title="Distribusi Sentimen pada Top 10 Mata Kuliah",
|
| 324 |
-
yaxis={
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
return True
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
def show_sentiment_stacked_percentage(df, aspek_columns):
|
| 331 |
-
"""Menampilkan stacked bar chart dengan persentase sentimen per aspek."""
|
| 332 |
-
|
| 333 |
-
if df.empty or not set(aspek_columns).issubset(df.columns):
|
| 334 |
-
st.warning("Data atau kolom aspek tidak tersedia.")
|
| 335 |
-
return
|
| 336 |
-
|
| 337 |
-
df_long = df.melt(
|
| 338 |
-
value_vars=aspek_columns,
|
| 339 |
-
var_name="aspek",
|
| 340 |
-
value_name="sentimen"
|
| 341 |
-
)
|
| 342 |
-
|
| 343 |
-
# Hitung persentase
|
| 344 |
-
count_data = df_long.groupby(
|
| 345 |
-
['aspek', 'sentimen']).size().reset_index(name='jumlah')
|
| 346 |
-
total_per_aspek = count_data.groupby('aspek')['jumlah'].sum().reset_index()
|
| 347 |
-
total_per_aspek.columns = ['aspek', 'total']
|
| 348 |
-
count_data = count_data.merge(total_per_aspek, on='aspek')
|
| 349 |
-
count_data['persentase'] = (
|
| 350 |
-
count_data['jumlah'] / count_data['total']) * 100
|
| 351 |
-
|
| 352 |
-
fig = px.bar(
|
| 353 |
-
count_data,
|
| 354 |
-
x="aspek",
|
| 355 |
-
y="persentase",
|
| 356 |
-
color="sentimen",
|
| 357 |
-
title="Persentase Distribusi Sentimen per Aspek",
|
| 358 |
-
color_discrete_map=sentimen_palette,
|
| 359 |
-
category_orders={
|
| 360 |
-
"sentimen": category_order,
|
| 361 |
-
"aspek": aspek_columns
|
| 362 |
}
|
| 363 |
)
|
| 364 |
-
|
| 365 |
-
yaxis_title="Persentase (%)",
|
| 366 |
-
xaxis_title="Aspek"
|
| 367 |
-
)
|
| 368 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
|
|
|
|
|
| 13 |
from config import ASPEK_COLUMNS
|
| 14 |
|
| 15 |
|
| 16 |
+
# Fixed color per sentiment class so every chart shares one color scheme.
sentimen_palette = dict(
    netral="#FFE24C",   # yellow
    positif="#4CFF72",  # green
    negatif="#FF4C4C",  # red
)

# Canonical sentiment ordering, reused wherever a chart needs a stable order.
category_order = [
    "netral",
    "positif",
    "negatif",
]

# Plotly display options passed to every st.plotly_chart call.
config_options = dict(
    scrollZoom=False,      # no zooming with the scroll wheel
    displayModeBar=False,  # hide the Plotly toolbar
)
|
| 31 |
|
| 32 |
|
| 33 |
def show_sentiment_bar_chart(df_predicted, aspek_columns):
    """
    Render a grouped bar chart of sentiment counts for each aspect.

    Args:
        df_predicted (pd.DataFrame): Frame holding one sentiment column per aspect.
        aspek_columns (list): Names of the aspect columns to chart.

    Returns:
        None. A Streamlit warning is shown instead of a chart when the frame
        is empty or any of the aspect columns is missing.
    """
    renderable = (not df_predicted.empty) and set(
        aspek_columns).issubset(df_predicted.columns)
    if not renderable:
        st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
        return

    tallies = (
        # Wide -> long: one row per (record, aspect) pair.
        df_predicted.melt(
            value_vars=aspek_columns,
            var_name="aspek",
            value_name="sentimen",
        )
        # Ordered categorical so that, with observed=False below, every
        # aspect gets a row for every sentiment in category_order.
        .assign(
            sentimen=lambda frame: pd.Categorical(
                frame["sentimen"],
                categories=category_order,
                ordered=True,
            )
        )
        .groupby(["aspek", "sentimen"], observed=False)
        .size()
        .reset_index(name="jumlah")
    )

    chart = px.bar(
        tallies,
        x="aspek",
        y="jumlah",
        color="sentimen",
        barmode="group",  # side-by-side bars per aspect
        color_discrete_map=sentimen_palette,
        category_orders={"sentimen": category_order},
    )
    chart.update_layout(title="Distribusi Sentimen per Aspek")

    st.plotly_chart(chart, use_container_width=True, config=config_options)
|
| 80 |
|
| 81 |
|
| 82 |
def show_sentiment_pie_chart(df_predicted, aspek_columns):
    """
    Render a donut chart of the overall sentiment mix across all aspects.

    Args:
        df_predicted (pd.DataFrame): Frame holding one sentiment column per aspect.
        aspek_columns (list): Names of the aspect columns to aggregate.

    Returns:
        None. A Streamlit warning is shown instead of a chart when the frame
        is empty or any of the aspect columns is missing.
    """
    # Same guard as show_sentiment_bar_chart: without it a missing column
    # raises KeyError and an empty frame renders a blank pie.
    if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
        st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
        return

    # Flatten every aspect column into one series and count each label.
    # value_counts() already returns the counts in descending order.
    sentimen_counts = (
        pd.Series(df_predicted[aspek_columns].values.ravel())
        .value_counts()
        .rename_axis("sentimen")
        .reset_index(name="jumlah")
    )

    fig = px.pie(
        sentimen_counts,
        names="sentimen",
        values="jumlah",
        color="sentimen",
        color_discrete_map=sentimen_palette,
        hole=0.3  # donut instead of a full pie
    )
    fig.update_layout(title="Total Komposisi Sentimen")

    # Print percentage and label inside each slice.
    fig.update_traces(textposition='inside', textinfo='percent+label')

    st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 114 |
|
| 115 |
|
| 116 |
def show_year_distribution(df):
    """
    Render a bar chart with the number of feedback entries per year.

    The year is read from the 'tahun' column; when that column is absent it
    is derived from 'tanggal'. The input frame is never modified.

    Args:
        df (pd.DataFrame): Input data.

    Returns:
        bool/None: True after the chart is rendered, None when no year
        information is available (no 'tahun'/'tanggal' column, or every
        year value is missing).
    """
    if 'tahun' in df.columns:
        years = df['tahun']
    elif 'tanggal' in df.columns:
        # Derive the year locally instead of writing a new column into the
        # caller's DataFrame; unparseable dates become NaN.
        years = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
    else:
        return None

    years = years.dropna()
    if years.empty:
        return None

    year_counts = (
        years.astype(int)
        .value_counts()
        .rename_axis('tahun')
        .reset_index(name='jumlah')
        .sort_values('jumlah', ascending=False)
    )

    fig = px.bar(
        year_counts,
        x='tahun',
        y='jumlah',
        color='tahun',
        title="Distribusi Kritik/Saran per Tahun"
    )
    # Treat years as discrete categories rather than a continuous axis.
    fig.update_layout(xaxis=dict(type='category'))

    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
|
| 161 |
|
| 162 |
|
| 163 |
def show_semester_distribution(df):
    """
    Render a bar chart with the number of feedback entries per semester.

    Args:
        df (pd.DataFrame): Input data.

    Returns:
        bool/None: True after the chart is rendered, None when the
        'semester' column is missing.
    """
    if 'semester' not in df.columns:
        return None

    # Count entries per semester, largest first.
    tallies = (
        df['semester']
        .value_counts()
        .rename_axis('semester')
        .reset_index(name='jumlah')
        .sort_values('jumlah', ascending=False)
    )

    chart = px.bar(
        tallies,
        x='semester',
        y='jumlah',
        color='semester',
        title="Distribusi Kritik/Saran per Semester",
    )
    # Order the x axis by bar total, descending.
    chart.update_layout(xaxis={'categoryorder': 'total descending'})

    st.plotly_chart(chart, use_container_width=True, config=config_options)
    return True
|
| 195 |
|
| 196 |
|
| 197 |
def show_prodi_distribution(df):
    """
    Render a horizontal bar chart with the number of feedback entries per
    study program.

    Args:
        df (pd.DataFrame): Input data.

    Returns:
        bool/None: True after the chart is rendered, None when the
        'nama_prodi' column is missing.
    """
    if 'nama_prodi' not in df.columns:
        return None

    # Count per program and sort ascending: a horizontal bar chart draws the
    # first row at the bottom, so the largest count ends up on top.
    tallies = (
        df['nama_prodi']
        .value_counts()
        .rename_axis('nama_prodi')
        .reset_index(name='jumlah')
        .sort_values(by='jumlah', ascending=True)
    )

    chart = px.bar(
        tallies,
        x='jumlah',
        y='nama_prodi',
        orientation='h',
        color='jumlah',
        title="Jumlah Kritik/Saran per Program Studi",
    )

    st.plotly_chart(chart, use_container_width=True, config=config_options)
    return True
|
| 230 |
|
| 231 |
|
| 232 |
def show_top10_matkul_distribution(df):
|
| 233 |
+
"""
|
| 234 |
+
Menampilkan 10 mata kuliah dengan jumlah kritik/saran terbanyak.
|
| 235 |
+
Format: [kode_matakuliah] - [nama_matakuliah]
|
| 236 |
+
|
| 237 |
+
Args:
|
| 238 |
+
df (pd.DataFrame): DataFrame input
|
| 239 |
+
|
| 240 |
+
Returns:
|
| 241 |
+
bool/None: True jika berhasil, None jika kolom tidak tersedia
|
| 242 |
+
"""
|
| 243 |
+
# Validasi: cek apakah kolom yang diperlukan ada
|
| 244 |
required_cols = ['nama_matakuliah', 'kode_matakuliah']
|
| 245 |
missing_cols = [col for col in required_cols if col not in df.columns]
|
| 246 |
|
| 247 |
if missing_cols:
|
| 248 |
return None
|
| 249 |
|
| 250 |
+
# Group by kode dan nama mata kuliah, ambil 10 teratas
|
| 251 |
matkul_counts = (
|
| 252 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 253 |
.size()
|
|
|
|
| 255 |
.sort_values(by='jumlah', ascending=False)
|
| 256 |
.head(10)
|
| 257 |
)
|
| 258 |
+
|
| 259 |
+
# Buat label gabungan: "kode - nama"
|
| 260 |
matkul_counts['label'] = (
|
| 261 |
matkul_counts['kode_matakuliah'] + " - " +
|
| 262 |
matkul_counts['nama_matakuliah']
|
| 263 |
)
|
| 264 |
+
|
| 265 |
+
# Sort ascending untuk horizontal bar (terbanyak di atas)
|
| 266 |
matkul_counts = matkul_counts.sort_values(by='jumlah', ascending=True)
|
| 267 |
|
| 268 |
+
# Buat horizontal bar chart
|
| 269 |
fig = px.bar(
|
| 270 |
matkul_counts,
|
| 271 |
x='jumlah',
|
|
|
|
| 274 |
title="Top 10 Mata Kuliah Berdasarkan Kritik/Saran",
|
| 275 |
color='jumlah'
|
| 276 |
)
|
| 277 |
+
|
| 278 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 279 |
return True
|
| 280 |
|
| 281 |
|
| 282 |
def show_sentiment_by_year(df, aspek_columns):
    """
    Render a grouped bar chart of sentiment counts per year.

    The year is read from the 'tahun' column; when that column is absent it
    is derived from 'tanggal'. The input frame is never modified.

    Args:
        df (pd.DataFrame): Input data.
        aspek_columns (list): Names of the aspect columns to aggregate.

    Returns:
        bool/None: True after the chart is rendered, None when neither a
        'tahun' nor a 'tanggal' column is available.
    """
    if 'tahun' in df.columns:
        years = df['tahun']
    elif 'tanggal' in df.columns:
        # Derive the year locally instead of writing a new column into the
        # caller's DataFrame; unparseable dates become NaN.
        years = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
    else:
        return None

    # Work on a fresh frame (aspect columns + year), then go wide -> long
    # with the year kept as the identifier.
    df_long = (
        df[aspek_columns]
        .assign(tahun=years)
        .melt(
            id_vars=['tahun'],
            value_vars=aspek_columns,
            var_name='aspek',
            value_name='sentimen'
        )
    )

    # Count each (year, sentiment) pair; rows with a missing year are
    # dropped by groupby.
    year_sentiment = df_long.groupby(
        ['tahun', 'sentimen'], observed=False
    ).size().reset_index(name='jumlah')

    year_sentiment = year_sentiment.sort_values('jumlah', ascending=False)

    fig = px.bar(
        year_sentiment,
        x='tahun',
        y='jumlah',
        color='sentimen',
        barmode='group',  # bars grouped per year
        color_discrete_map=sentimen_palette
    )
    fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Tahun")

    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
|
| 330 |
|
| 331 |
|
| 332 |
def show_sentiment_by_semester(df, aspek_columns):
    """
    Render a grouped bar chart of sentiment counts per semester.

    Args:
        df (pd.DataFrame): Input data.
        aspek_columns (list): Names of the aspect columns to aggregate.

    Returns:
        bool/None: True after the chart is rendered, None when the
        'semester' column is missing.
    """
    if 'semester' not in df.columns:
        return None

    tallies = (
        # Wide -> long with the semester kept as the identifier.
        df.melt(
            id_vars=['semester'],
            value_vars=aspek_columns,
            var_name='aspek',
            value_name='sentimen',
        )
        # Count each (semester, sentiment) pair, largest first.
        .groupby(['semester', 'sentimen'], observed=False)
        .size()
        .reset_index(name='jumlah')
        .sort_values('jumlah', ascending=False)
    )

    chart = px.bar(
        tallies,
        x='semester',
        y='jumlah',
        color='sentimen',
        barmode='group',  # bars grouped per semester
        color_discrete_map=sentimen_palette,
    )
    chart.update_layout(title="Distribusi Sentimen Kritik/Saran per Semester")

    st.plotly_chart(chart, use_container_width=True, config=config_options)
    return True
|
| 376 |
|
| 377 |
|
| 378 |
def show_sentiment_by_prodi(df, aspek_columns):
|
| 379 |
+
"""
|
| 380 |
+
Menampilkan distribusi sentimen per program studi dalam horizontal grouped bar chart.
|
| 381 |
+
Program studi diurutkan berdasarkan total jumlah kritik/saran.
|
| 382 |
+
|
| 383 |
+
Args:
|
| 384 |
+
df (pd.DataFrame): DataFrame input
|
| 385 |
+
aspek_columns (list): List nama kolom aspek
|
| 386 |
+
|
| 387 |
+
Returns:
|
| 388 |
+
bool/None: True jika berhasil, None jika kolom tidak tersedia
|
| 389 |
+
"""
|
| 390 |
+
# Validasi: cek apakah kolom nama_prodi ada
|
| 391 |
if 'nama_prodi' not in df.columns:
|
| 392 |
return None
|
| 393 |
|
| 394 |
+
# Transform data dari wide ke long format
|
| 395 |
df_long = df.melt(
|
| 396 |
id_vars=['nama_prodi'],
|
| 397 |
value_vars=aspek_columns,
|
|
|
|
| 399 |
value_name='sentimen'
|
| 400 |
)
|
| 401 |
|
| 402 |
+
# Group by prodi dan sentimen, hitung frekuensi
|
| 403 |
prodi_sentiment = (
|
| 404 |
df_long.groupby(['nama_prodi', 'sentimen'], observed=False)
|
| 405 |
.size()
|
| 406 |
.reset_index(name='jumlah')
|
| 407 |
)
|
| 408 |
|
| 409 |
+
# Hitung total per prodi untuk sorting
|
| 410 |
total_per_prodi = (
|
| 411 |
prodi_sentiment.groupby('nama_prodi')['jumlah']
|
| 412 |
.sum()
|
| 413 |
.sort_values(ascending=False)
|
| 414 |
)
|
| 415 |
+
|
| 416 |
+
# Reverse order untuk horizontal bar (terbanyak di atas)
|
| 417 |
ordered_categories = total_per_prodi.index.tolist()[::-1]
|
| 418 |
|
| 419 |
+
# Konversi ke categorical untuk maintain order
|
| 420 |
prodi_sentiment['nama_prodi'] = pd.Categorical(
|
| 421 |
prodi_sentiment['nama_prodi'],
|
| 422 |
categories=ordered_categories,
|
| 423 |
ordered=True
|
| 424 |
)
|
| 425 |
|
| 426 |
+
# Buat horizontal grouped bar chart
|
| 427 |
fig = px.bar(
|
| 428 |
prodi_sentiment,
|
| 429 |
y='nama_prodi',
|
| 430 |
x='jumlah',
|
| 431 |
color='sentimen',
|
| 432 |
barmode='group',
|
| 433 |
+
orientation='h', # Horizontal orientation
|
| 434 |
color_discrete_map=sentimen_palette
|
| 435 |
)
|
| 436 |
fig.update_layout(
|
| 437 |
title="Distribusi Sentimen per Program Studi",
|
| 438 |
+
yaxis={
|
| 439 |
+
'categoryorder': 'array',
|
| 440 |
+
'categoryarray': ordered_categories
|
| 441 |
+
}
|
| 442 |
)
|
| 443 |
+
|
| 444 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 445 |
return True
|
| 446 |
|
| 447 |
|
| 448 |
def show_sentiment_by_top10_matkul(df, aspek_columns):
|
| 449 |
+
"""
|
| 450 |
+
Menampilkan distribusi sentimen pada 10 mata kuliah dengan kritik/saran terbanyak.
|
| 451 |
+
Chart menggunakan horizontal grouped bar, diurutkan berdasarkan total kritik/saran.
|
| 452 |
+
|
| 453 |
+
Args:
|
| 454 |
+
df (pd.DataFrame): DataFrame input
|
| 455 |
+
aspek_columns (list): List nama kolom aspek
|
| 456 |
+
|
| 457 |
+
Returns:
|
| 458 |
+
bool/None: True jika berhasil, None jika kolom tidak tersedia
|
| 459 |
+
"""
|
| 460 |
+
# Validasi: cek apakah kolom yang diperlukan ada
|
| 461 |
required_cols = ['kode_matakuliah', 'nama_matakuliah']
|
| 462 |
missing_cols = [col for col in required_cols if col not in df.columns]
|
| 463 |
|
| 464 |
if missing_cols:
|
| 465 |
return None
|
| 466 |
|
| 467 |
+
# Identifikasi top 10 mata kuliah berdasarkan jumlah kritik/saran
|
| 468 |
df_top10 = (
|
| 469 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 470 |
.size()
|
|
|
|
| 473 |
.index
|
| 474 |
)
|
| 475 |
|
| 476 |
+
# Filter data hanya untuk top 10 mata kuliah
|
| 477 |
df_filtered = df[df.set_index(
|
| 478 |
['kode_matakuliah', 'nama_matakuliah']).index.isin(df_top10)]
|
| 479 |
|
| 480 |
+
# Transform data dari wide ke long format
|
| 481 |
df_long = df_filtered.melt(
|
| 482 |
id_vars=['kode_matakuliah', 'nama_matakuliah'],
|
| 483 |
value_vars=aspek_columns,
|
|
|
|
| 485 |
value_name='sentimen'
|
| 486 |
)
|
| 487 |
|
| 488 |
+
# Buat label gabungan: "kode - nama"
|
| 489 |
df_long['label'] = (
|
| 490 |
df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
|
| 491 |
)
|
| 492 |
|
| 493 |
+
# Group by label dan sentimen, hitung frekuensi
|
| 494 |
matkul_sentiment = (
|
| 495 |
df_long.groupby(['label', 'sentimen'], observed=False)
|
| 496 |
.size()
|
| 497 |
.reset_index(name='jumlah')
|
| 498 |
)
|
| 499 |
|
| 500 |
+
# Hitung total per label untuk sorting
|
| 501 |
total_per_label = (
|
| 502 |
matkul_sentiment.groupby('label')['jumlah']
|
| 503 |
.sum()
|
| 504 |
.sort_values(ascending=False)
|
| 505 |
)
|
| 506 |
+
|
| 507 |
+
# Reverse order untuk horizontal bar (terbanyak di atas)
|
| 508 |
ordered_labels = total_per_label.index.tolist()[::-1]
|
| 509 |
|
| 510 |
+
# Konversi ke categorical untuk maintain order
|
| 511 |
matkul_sentiment['label'] = pd.Categorical(
|
| 512 |
matkul_sentiment['label'],
|
| 513 |
categories=ordered_labels,
|
| 514 |
ordered=True
|
| 515 |
)
|
| 516 |
|
| 517 |
+
# Buat horizontal grouped bar chart
|
| 518 |
fig = px.bar(
|
| 519 |
matkul_sentiment,
|
| 520 |
y='label',
|
|
|
|
| 526 |
)
|
| 527 |
fig.update_layout(
|
| 528 |
title="Distribusi Sentimen pada Top 10 Mata Kuliah",
|
| 529 |
+
yaxis={
|
| 530 |
+
'categoryorder': 'array',
|
| 531 |
+
'categoryarray': ordered_labels
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
}
|
| 533 |
)
|
| 534 |
+
|
|
|
|
|
|
|
|
|
|
| 535 |
st.plotly_chart(fig, use_container_width=True, config=config_options)
|
| 536 |
+
return True
|