Spaces:
Running
Running
add some comments
Browse files- app.py +87 -38
- visualization.py +22 -4
app.py
CHANGED
|
@@ -34,11 +34,12 @@ from visualization import (
|
|
| 34 |
)
|
| 35 |
from preprocessing import text_preprocessing_pipeline
|
| 36 |
|
| 37 |
-
# Konfigurasi untuk chunked processing
|
| 38 |
CHUNK_SIZE = 2500
|
| 39 |
ENABLE_CHUNKED = True
|
| 40 |
CACHE_EXPIRY_HOURS = 24
|
| 41 |
|
|
|
|
| 42 |
os.makedirs("chache_file", exist_ok=True)
|
| 43 |
os.makedirs("chache_file/sessions", exist_ok=True)
|
| 44 |
|
|
@@ -56,27 +57,30 @@ st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/fon
|
|
| 56 |
|
| 57 |
|
| 58 |
def get_session_id():
|
| 59 |
-
"""Generate atau
|
| 60 |
query_params = st.query_params
|
| 61 |
|
|
|
|
| 62 |
if "sid" in query_params:
|
| 63 |
sid = query_params["sid"]
|
| 64 |
st.session_state.session_id = sid
|
| 65 |
return sid
|
| 66 |
|
|
|
|
| 67 |
if "session_id" not in st.session_state:
|
| 68 |
new_session_id = str(uuid.uuid4())
|
| 69 |
st.session_state.session_id = new_session_id
|
| 70 |
st.query_params["sid"] = new_session_id
|
| 71 |
return new_session_id
|
| 72 |
|
|
|
|
| 73 |
existing_id = st.session_state.session_id
|
| 74 |
st.query_params["sid"] = existing_id
|
| 75 |
return existing_id
|
| 76 |
|
| 77 |
|
| 78 |
def get_session_cache_dir():
|
| 79 |
-
"""
|
| 80 |
sid = get_session_id()
|
| 81 |
cache_dir = Path(f"chache_file/sessions/{sid}")
|
| 82 |
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -84,14 +88,14 @@ def get_session_cache_dir():
|
|
| 84 |
|
| 85 |
|
| 86 |
def get_session_chunks_dir():
|
| 87 |
-
"""
|
| 88 |
chunks_dir = get_session_cache_dir() / "chunks"
|
| 89 |
chunks_dir.mkdir(parents=True, exist_ok=True)
|
| 90 |
return chunks_dir
|
| 91 |
|
| 92 |
|
| 93 |
def cleanup_old_sessions():
|
| 94 |
-
"""Hapus session
|
| 95 |
sessions_dir = Path("chache_file/sessions")
|
| 96 |
if not sessions_dir.exists():
|
| 97 |
return
|
|
@@ -102,6 +106,7 @@ def cleanup_old_sessions():
|
|
| 102 |
mod_time = session_dir.stat().st_mtime
|
| 103 |
age_hours = (current_time - mod_time) / 3600
|
| 104 |
|
|
|
|
| 105 |
if age_hours > CACHE_EXPIRY_HOURS:
|
| 106 |
try:
|
| 107 |
shutil.rmtree(session_dir)
|
|
@@ -110,18 +115,21 @@ def cleanup_old_sessions():
|
|
| 110 |
print(f"Error deleting session {session_dir.name}: {e}")
|
| 111 |
|
| 112 |
|
|
|
|
| 113 |
cleanup_old_sessions()
|
| 114 |
|
| 115 |
|
| 116 |
@st.cache_resource(show_spinner=False)
|
| 117 |
def get_model_resources():
|
| 118 |
-
"""Memuat model dan tokenizer IndoBERT
|
| 119 |
return load_model_and_tokenizer()
|
| 120 |
|
| 121 |
|
|
|
|
| 122 |
with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
|
| 123 |
model, tokenizer, le, device = get_model_resources()
|
| 124 |
|
|
|
|
| 125 |
success_placeholder = st.empty()
|
| 126 |
success_placeholder.success("Model dan tokenizer berhasil dimuat!")
|
| 127 |
time.sleep(1)
|
|
@@ -129,7 +137,7 @@ success_placeholder.empty()
|
|
| 129 |
|
| 130 |
|
| 131 |
def convert_df_to_excel(df):
|
| 132 |
-
"""Mengubah DataFrame menjadi file Excel dalam bentuk byte stream
|
| 133 |
output = BytesIO()
|
| 134 |
with pd.ExcelWriter(output, engine="openpyxl") as writer:
|
| 135 |
df.to_excel(writer, index=False)
|
|
@@ -137,7 +145,7 @@ def convert_df_to_excel(df):
|
|
| 137 |
|
| 138 |
|
| 139 |
def clear_memory():
|
| 140 |
-
"""
|
| 141 |
gc.collect()
|
| 142 |
if torch.cuda.is_available():
|
| 143 |
torch.cuda.empty_cache()
|
|
@@ -146,9 +154,9 @@ def clear_memory():
|
|
| 146 |
def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
|
| 147 |
"""
|
| 148 |
Memproses satu chunk data dengan batch processing.
|
| 149 |
-
Progress bar: Preprocessing 0-100%, lalu Predicting 0-100%
|
| 150 |
"""
|
| 151 |
-
# STEP 1: Preprocessing (0-100%)
|
| 152 |
cleaned_text_list = []
|
| 153 |
total_rows = len(chunk_dataframe)
|
| 154 |
|
|
@@ -156,6 +164,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 156 |
clean_text = text_preprocessing_pipeline(str(raw_text))
|
| 157 |
cleaned_text_list.append(clean_text)
|
| 158 |
|
|
|
|
| 159 |
if idx % 50 == 0 or idx == total_rows - 1:
|
| 160 |
progress = (idx + 1) / total_rows
|
| 161 |
progress_bar.progress(progress)
|
|
@@ -168,11 +177,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 168 |
f"Chunk {chunk_num}/{total_chunk_count} | Memulai prediksi...")
|
| 169 |
time.sleep(0.2)
|
| 170 |
|
| 171 |
-
# STEP 2: Batch Prediction (0-100%)
|
| 172 |
batch_sz = CONFIG.get("batch_size", 32)
|
| 173 |
num_sents = len(cleaned_text_list)
|
| 174 |
num_asps = len(ASPEK_COLUMNS)
|
| 175 |
|
|
|
|
| 176 |
ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
|
| 177 |
tokenizer, CONFIG["max_len"])
|
| 178 |
dl = DataLoader(
|
|
@@ -182,11 +192,13 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 182 |
num_workers=0
|
| 183 |
)
|
| 184 |
|
|
|
|
| 185 |
predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
|
| 186 |
|
| 187 |
batch_counter = 0
|
| 188 |
total_batch_count = len(dl)
|
| 189 |
|
|
|
|
| 190 |
model.eval()
|
| 191 |
with torch.no_grad():
|
| 192 |
for batch_data in dl:
|
|
@@ -195,22 +207,25 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 195 |
sent_idxs = batch_data['sent_idx'].numpy()
|
| 196 |
asp_idxs = batch_data['aspect_idx'].numpy()
|
| 197 |
|
|
|
|
| 198 |
model_outputs = model(inp_ids, attn_mask)
|
| 199 |
probabilities = F.softmax(model_outputs, dim=1)
|
| 200 |
predicted_indices = torch.argmax(
|
| 201 |
probabilities, dim=1).cpu().numpy()
|
| 202 |
pred_labels = le.inverse_transform(predicted_indices)
|
| 203 |
|
|
|
|
| 204 |
for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
|
| 205 |
predictions_matrix[s_idx][a_idx] = lbl
|
| 206 |
|
|
|
|
| 207 |
batch_counter += 1
|
| 208 |
progress = batch_counter / total_batch_count
|
| 209 |
progress_bar.progress(progress)
|
| 210 |
status_text.text(
|
| 211 |
f"Chunk {chunk_num}/{total_chunk_count} | Predicting: {batch_counter}/{total_batch_count} batches")
|
| 212 |
|
| 213 |
-
# STEP 3:
|
| 214 |
result_list = []
|
| 215 |
for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
|
| 216 |
row_dict = data_row.to_dict()
|
|
@@ -221,11 +236,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 221 |
|
| 222 |
result_dataframe = pd.DataFrame(result_list)
|
| 223 |
|
|
|
|
| 224 |
chunks_directory = get_session_chunks_dir()
|
| 225 |
chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
|
| 226 |
result_dataframe.to_csv(chunk_filepath, index=False)
|
| 227 |
|
| 228 |
-
#
|
| 229 |
progress_bar.progress(1.0)
|
| 230 |
status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
|
| 231 |
|
|
@@ -235,7 +251,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
|
|
| 235 |
|
| 236 |
|
| 237 |
def get_available_columns(df):
|
| 238 |
-
"""Deteksi kolom-kolom yang tersedia dalam dataframe"""
|
| 239 |
available = {
|
| 240 |
'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
|
| 241 |
'has_semester': 'semester' in df.columns,
|
|
@@ -256,7 +272,7 @@ st.markdown(" ")
|
|
| 256 |
st.markdown(" ")
|
| 257 |
st.markdown(" ")
|
| 258 |
|
| 259 |
-
# Panduan pengunaan
|
| 260 |
steps = [
|
| 261 |
{"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
|
| 262 |
"description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
|
|
@@ -283,18 +299,19 @@ for i, step in enumerate(steps):
|
|
| 283 |
st.markdown("")
|
| 284 |
st.markdown("")
|
| 285 |
|
| 286 |
-
# Upload file
|
| 287 |
uploaded_file = st.file_uploader(
|
| 288 |
" Upload Data Kritik & Saran",
|
| 289 |
type=["xlsx"],
|
| 290 |
help="File maksimal 200MB dengan format .xlsx"
|
| 291 |
)
|
| 292 |
|
| 293 |
-
#
|
| 294 |
session_cache_dir = get_session_cache_dir()
|
| 295 |
session_result_file = session_cache_dir / "temp_predicted.csv"
|
| 296 |
session_chunks_dir = get_session_chunks_dir()
|
| 297 |
|
|
|
|
| 298 |
if session_result_file.exists():
|
| 299 |
if st.button("Hapus Cache Data"):
|
| 300 |
session_result_file.unlink()
|
|
@@ -302,6 +319,7 @@ if session_result_file.exists():
|
|
| 302 |
time.sleep(1)
|
| 303 |
st.rerun()
|
| 304 |
|
|
|
|
| 305 |
if session_chunks_dir.exists():
|
| 306 |
chunk_files = list(session_chunks_dir.glob("*.csv"))
|
| 307 |
if chunk_files:
|
|
@@ -313,6 +331,7 @@ if session_chunks_dir.exists():
|
|
| 313 |
time.sleep(1)
|
| 314 |
st.rerun()
|
| 315 |
|
|
|
|
| 316 |
if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
|
| 317 |
if not uploaded_file:
|
| 318 |
metadata_file = session_cache_dir / "metadata.txt"
|
|
@@ -334,9 +353,11 @@ if session_result_file.exists() or (session_chunks_dir.exists() and list(session
|
|
| 334 |
st.caption(" ")
|
| 335 |
|
| 336 |
|
|
|
|
| 337 |
if "df_predicted" not in st.session_state:
|
| 338 |
st.session_state.df_predicted = None
|
| 339 |
|
|
|
|
| 340 |
if st.session_state.df_predicted is None and session_result_file.exists():
|
| 341 |
try:
|
| 342 |
df_cached = pd.read_csv(session_result_file)
|
|
@@ -349,14 +370,17 @@ if st.session_state.df_predicted is None and session_result_file.exists():
|
|
| 349 |
st.warning(f"Gagal memuat cache: {e}")
|
| 350 |
|
| 351 |
|
|
|
|
| 352 |
if uploaded_file:
|
| 353 |
file_bytes = uploaded_file.getvalue()
|
|
|
|
| 354 |
if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
|
| 355 |
st.session_state.last_uploaded_file = file_bytes
|
| 356 |
st.session_state.uploaded_filename = uploaded_file.name
|
| 357 |
try:
|
| 358 |
df_uploaded = pd.read_excel(BytesIO(file_bytes))
|
| 359 |
|
|
|
|
| 360 |
if "tahun" in df_uploaded.columns:
|
| 361 |
df_uploaded["tahun"] = pd.to_numeric(
|
| 362 |
df_uploaded["tahun"], errors='coerce').astype('Int64')
|
|
@@ -364,11 +388,14 @@ if uploaded_file:
|
|
| 364 |
except ValueError as err:
|
| 365 |
st.error(f"Gagal membaca file: {err}")
|
| 366 |
else:
|
|
|
|
| 367 |
if "kritik_saran" not in df_uploaded.columns:
|
| 368 |
st.error("Kolom 'kritik_saran' tidak ditemukan.")
|
| 369 |
else:
|
|
|
|
| 370 |
df_uploaded = df_uploaded.drop_duplicates(
|
| 371 |
subset=["kritik_saran"])
|
|
|
|
| 372 |
for aspect_col in ASPEK_COLUMNS:
|
| 373 |
if aspect_col not in df_uploaded.columns:
|
| 374 |
df_uploaded[aspect_col] = None
|
|
@@ -376,9 +403,11 @@ if uploaded_file:
|
|
| 376 |
st.markdown("### Preprocessing dan Prediksi")
|
| 377 |
|
| 378 |
total_rows = len(df_uploaded)
|
|
|
|
| 379 |
use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
|
| 380 |
|
| 381 |
if use_chunked:
|
|
|
|
| 382 |
num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
|
| 383 |
|
| 384 |
info_col1, info_col2, info_col3 = st.columns(3)
|
|
@@ -397,6 +426,7 @@ if uploaded_file:
|
|
| 397 |
chunk_status_text = st.empty()
|
| 398 |
overall_status = st.empty()
|
| 399 |
|
|
|
|
| 400 |
for start_idx in range(0, total_rows, CHUNK_SIZE):
|
| 401 |
current_chunk_number = (start_idx // CHUNK_SIZE) + 1
|
| 402 |
current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
|
|
@@ -405,6 +435,7 @@ if uploaded_file:
|
|
| 405 |
current_chunk_file = session_chunks_dir / \
|
| 406 |
f"chunk_{current_chunk_number}.csv"
|
| 407 |
|
|
|
|
| 408 |
if current_chunk_file.exists():
|
| 409 |
chunk_result = pd.read_csv(current_chunk_file)
|
| 410 |
all_chunk_results.append(chunk_result)
|
|
@@ -423,6 +454,7 @@ if uploaded_file:
|
|
| 423 |
time.sleep(0.3)
|
| 424 |
continue
|
| 425 |
|
|
|
|
| 426 |
chunk_progress_bar.progress(0)
|
| 427 |
|
| 428 |
chunk_result = process_chunk_batch(
|
|
@@ -431,6 +463,7 @@ if uploaded_file:
|
|
| 431 |
)
|
| 432 |
all_chunk_results.append(chunk_result)
|
| 433 |
|
|
|
|
| 434 |
processed = min(start_idx + CHUNK_SIZE, total_rows)
|
| 435 |
progress_pct = (processed / total_rows) * 100
|
| 436 |
elapsed = time.time() - start_time
|
|
@@ -445,6 +478,7 @@ if uploaded_file:
|
|
| 445 |
|
| 446 |
time.sleep(0.3)
|
| 447 |
|
|
|
|
| 448 |
chunk_status_text.empty()
|
| 449 |
overall_status.info("🔄 Menggabungkan semua chunks...")
|
| 450 |
df_session = pd.concat(
|
|
@@ -455,6 +489,7 @@ if uploaded_file:
|
|
| 455 |
duration = end_time - start_time
|
| 456 |
|
| 457 |
else:
|
|
|
|
| 458 |
st.info(
|
| 459 |
f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
|
| 460 |
|
|
@@ -463,6 +498,7 @@ if uploaded_file:
|
|
| 463 |
progress_bar = st.progress(0)
|
| 464 |
status_text = st.empty()
|
| 465 |
|
|
|
|
| 466 |
cleaned_text_list = []
|
| 467 |
total_preprocessing = len(df_uploaded)
|
| 468 |
|
|
@@ -476,6 +512,7 @@ if uploaded_file:
|
|
| 476 |
status_text.text(
|
| 477 |
f"Preprocessing: {idx+1}/{total_preprocessing} rows")
|
| 478 |
|
|
|
|
| 479 |
progress_bar.progress(0)
|
| 480 |
status_text.text("Memulai prediksi...")
|
| 481 |
time.sleep(0.3)
|
|
@@ -519,6 +556,7 @@ if uploaded_file:
|
|
| 519 |
status_text.text(
|
| 520 |
f"Predicting: {batch_counter}/{total_batch_count} batches")
|
| 521 |
|
|
|
|
| 522 |
result_list = []
|
| 523 |
for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
|
| 524 |
row_dict = data_row.to_dict()
|
|
@@ -538,16 +576,20 @@ if uploaded_file:
|
|
| 538 |
end_time = time.time()
|
| 539 |
duration = end_time - start_time
|
| 540 |
|
|
|
|
| 541 |
st.session_state.df_predicted = df_session
|
| 542 |
df_session.to_csv(session_result_file, index=False)
|
| 543 |
|
|
|
|
| 544 |
metadata_file = session_cache_dir / "metadata.txt"
|
| 545 |
with open(metadata_file, "w", encoding="utf-8") as f:
|
| 546 |
f.write(uploaded_file.name)
|
| 547 |
|
|
|
|
| 548 |
total_items = total_rows * len(ASPEK_COLUMNS)
|
| 549 |
items_per_second = total_items / duration if duration > 0 else 0
|
| 550 |
|
|
|
|
| 551 |
if use_chunked:
|
| 552 |
st.success(
|
| 553 |
f"✅ **Chunked + Batch Processing selesai!**\n\n"
|
|
@@ -567,11 +609,11 @@ if uploaded_file:
|
|
| 567 |
f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
|
| 568 |
)
|
| 569 |
|
| 570 |
-
#
|
| 571 |
if st.session_state.df_predicted is not None:
|
| 572 |
df_predicted = st.session_state.df_predicted
|
| 573 |
|
| 574 |
-
# Deteksi kolom yang tersedia
|
| 575 |
available_cols = get_available_columns(df_predicted)
|
| 576 |
|
| 577 |
# Sidebar filter dengan pengecekan kolom dinamis
|
|
@@ -586,7 +628,7 @@ if st.session_state.df_predicted is not None:
|
|
| 586 |
st.sidebar.info(
|
| 587 |
"Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
|
| 588 |
|
| 589 |
-
# Filter Mata Kuliah
|
| 590 |
selected_matkul = []
|
| 591 |
if available_cols['has_matkul']:
|
| 592 |
matkul_options = sorted(
|
|
@@ -595,7 +637,7 @@ if st.session_state.df_predicted is not None:
|
|
| 595 |
selected_matkul = st.sidebar.multiselect(
|
| 596 |
"Nama Mata Kuliah", matkul_options, default=matkul_options)
|
| 597 |
|
| 598 |
-
# Filter Program Studi
|
| 599 |
selected_prodi = []
|
| 600 |
if available_cols['has_prodi']:
|
| 601 |
prodi_options = sorted(
|
|
@@ -604,7 +646,7 @@ if st.session_state.df_predicted is not None:
|
|
| 604 |
selected_prodi = st.sidebar.multiselect(
|
| 605 |
"Program Studi", prodi_options, default=prodi_options)
|
| 606 |
|
| 607 |
-
# Filter Tahun
|
| 608 |
selected_tahun = []
|
| 609 |
if available_cols['has_tahun']:
|
| 610 |
if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
|
|
@@ -617,7 +659,7 @@ if st.session_state.df_predicted is not None:
|
|
| 617 |
selected_tahun = st.sidebar.multiselect(
|
| 618 |
"Tahun", tahun_options, default=tahun_options)
|
| 619 |
|
| 620 |
-
# Filter Semester
|
| 621 |
selected_semester = []
|
| 622 |
if available_cols['has_semester']:
|
| 623 |
semester_options = sorted(
|
|
@@ -626,7 +668,7 @@ if st.session_state.df_predicted is not None:
|
|
| 626 |
selected_semester = st.sidebar.multiselect(
|
| 627 |
"Semester", semester_options, default=semester_options)
|
| 628 |
|
| 629 |
-
#
|
| 630 |
df_filtered = df_clean.copy()
|
| 631 |
|
| 632 |
if selected_matkul and available_cols['has_matkul']:
|
|
@@ -648,7 +690,7 @@ if st.session_state.df_predicted is not None:
|
|
| 648 |
st.markdown("### Tabel Data Hasil Prediksi")
|
| 649 |
st.dataframe(df_filtered, width='stretch')
|
| 650 |
|
| 651 |
-
#
|
| 652 |
col_dl1, col_dl2 = st.columns(2)
|
| 653 |
with col_dl1:
|
| 654 |
st.download_button(
|
|
@@ -677,17 +719,18 @@ if st.session_state.df_predicted is not None:
|
|
| 677 |
st.markdown("### Ringkasan Cepat")
|
| 678 |
st.markdown("")
|
| 679 |
|
|
|
|
| 680 |
total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
|
| 681 |
total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
|
| 682 |
total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
|
| 683 |
|
| 684 |
-
#
|
| 685 |
summary_cols = []
|
| 686 |
|
| 687 |
# Kolom dasar (selalu ada)
|
| 688 |
summary_cols.extend(['ulasan', 'aspek'])
|
| 689 |
|
| 690 |
-
# Kolom opsional
|
| 691 |
if available_cols['has_matkul']:
|
| 692 |
summary_cols.append('matkul')
|
| 693 |
if available_cols['has_prodi']:
|
|
@@ -695,31 +738,31 @@ if st.session_state.df_predicted is not None:
|
|
| 695 |
if available_cols['has_semester']:
|
| 696 |
summary_cols.append('semester')
|
| 697 |
|
| 698 |
-
# Buat kolom dinamis
|
| 699 |
num_cols = len(summary_cols)
|
| 700 |
cols = st.columns(num_cols)
|
| 701 |
|
| 702 |
col_idx = 0
|
| 703 |
|
| 704 |
-
# Ulasan & Aspek
|
| 705 |
cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
|
| 706 |
col_idx += 1
|
| 707 |
cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
|
| 708 |
col_idx += 1
|
| 709 |
|
| 710 |
-
# Mata Kuliah (jika
|
| 711 |
if available_cols['has_matkul']:
|
| 712 |
matkul_count = df_filtered['nama_matakuliah'].nunique()
|
| 713 |
cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
|
| 714 |
col_idx += 1
|
| 715 |
|
| 716 |
-
# Prodi (jika
|
| 717 |
if available_cols['has_prodi']:
|
| 718 |
prodi_count = df_filtered['nama_prodi'].nunique()
|
| 719 |
cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
|
| 720 |
col_idx += 1
|
| 721 |
|
| 722 |
-
# Semester (jika
|
| 723 |
if available_cols['has_semester']:
|
| 724 |
semester_count = df_filtered['semester'].nunique()
|
| 725 |
cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
|
|
@@ -727,7 +770,7 @@ if st.session_state.df_predicted is not None:
|
|
| 727 |
|
| 728 |
st.markdown("")
|
| 729 |
|
| 730 |
-
# Baris kedua: Sentimen
|
| 731 |
summary_cols2 = ['positif', 'netral', 'negatif']
|
| 732 |
|
| 733 |
if available_cols['has_tahun']:
|
|
@@ -738,6 +781,7 @@ if st.session_state.df_predicted is not None:
|
|
| 738 |
cols2 = st.columns(len(summary_cols2))
|
| 739 |
|
| 740 |
col_idx2 = 0
|
|
|
|
| 741 |
cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
|
| 742 |
col_idx2 += 1
|
| 743 |
cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
|
|
@@ -745,7 +789,7 @@ if st.session_state.df_predicted is not None:
|
|
| 745 |
cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
|
| 746 |
col_idx2 += 1
|
| 747 |
|
| 748 |
-
# Rentang
|
| 749 |
if available_cols['has_tahun']:
|
| 750 |
if 'tahun' in df_filtered.columns:
|
| 751 |
tahun_valid = df_filtered['tahun'].dropna()
|
|
@@ -763,7 +807,7 @@ if st.session_state.df_predicted is not None:
|
|
| 763 |
cols2[col_idx2].metric("Rentang Tahun", "N/A")
|
| 764 |
col_idx2 += 1
|
| 765 |
|
| 766 |
-
# Rata-rata
|
| 767 |
if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
|
| 768 |
try:
|
| 769 |
word_counts = df_filtered['kritik_saran'].astype(
|
|
@@ -784,9 +828,10 @@ if st.session_state.df_predicted is not None:
|
|
| 784 |
with col2:
|
| 785 |
show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
|
| 786 |
|
| 787 |
-
# Visualisasi berdasarkan kolom yang tersedia
|
| 788 |
viz_shown = False
|
| 789 |
|
|
|
|
| 790 |
if available_cols['has_tahun'] or available_cols['has_semester']:
|
| 791 |
col1, col2 = st.columns(2)
|
| 792 |
with col1:
|
|
@@ -800,19 +845,21 @@ if st.session_state.df_predicted is not None:
|
|
| 800 |
if result:
|
| 801 |
viz_shown = True
|
| 802 |
|
|
|
|
| 803 |
if available_cols['has_prodi']:
|
| 804 |
st.markdown("---")
|
| 805 |
result = show_prodi_distribution(df_filtered)
|
| 806 |
if result:
|
| 807 |
viz_shown = True
|
| 808 |
|
|
|
|
| 809 |
if available_cols['has_matkul']:
|
| 810 |
st.markdown("---")
|
| 811 |
result = show_top10_matkul_distribution(df_filtered)
|
| 812 |
if result:
|
| 813 |
viz_shown = True
|
| 814 |
|
| 815 |
-
# Sentimen per
|
| 816 |
if available_cols['has_tahun'] or available_cols['has_semester']:
|
| 817 |
st.markdown("---")
|
| 818 |
col1, col2 = st.columns(2)
|
|
@@ -827,19 +874,21 @@ if st.session_state.df_predicted is not None:
|
|
| 827 |
if result:
|
| 828 |
viz_shown = True
|
| 829 |
|
|
|
|
| 830 |
if available_cols['has_prodi']:
|
| 831 |
st.markdown("---")
|
| 832 |
result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
|
| 833 |
if result:
|
| 834 |
viz_shown = True
|
| 835 |
|
|
|
|
| 836 |
if available_cols['has_matkul']:
|
| 837 |
st.markdown("---")
|
| 838 |
result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
|
| 839 |
if result:
|
| 840 |
viz_shown = True
|
| 841 |
|
| 842 |
-
# Footer
|
| 843 |
st.caption("""
|
| 844 |
<div class='footer'>
|
| 845 |
© 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
|
|
|
|
| 34 |
)
|
| 35 |
from preprocessing import text_preprocessing_pipeline
|
| 36 |
|
| 37 |
+
# Konfigurasi untuk chunked processing (membagi data besar menjadi bagian kecil)
|
| 38 |
CHUNK_SIZE = 2500
|
| 39 |
ENABLE_CHUNKED = True
|
| 40 |
CACHE_EXPIRY_HOURS = 24
|
| 41 |
|
| 42 |
+
# Buat direktori untuk menyimpan cache file
|
| 43 |
os.makedirs("chache_file", exist_ok=True)
|
| 44 |
os.makedirs("chache_file/sessions", exist_ok=True)
|
| 45 |
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
def get_session_id():
|
| 60 |
+
"""Generate atau ambil session ID untuk user - tetap ada meski refresh halaman"""
|
| 61 |
query_params = st.query_params
|
| 62 |
|
| 63 |
+
# Cek apakah session ID sudah ada di URL parameter
|
| 64 |
if "sid" in query_params:
|
| 65 |
sid = query_params["sid"]
|
| 66 |
st.session_state.session_id = sid
|
| 67 |
return sid
|
| 68 |
|
| 69 |
+
# Jika belum ada, buat session ID baru
|
| 70 |
if "session_id" not in st.session_state:
|
| 71 |
new_session_id = str(uuid.uuid4())
|
| 72 |
st.session_state.session_id = new_session_id
|
| 73 |
st.query_params["sid"] = new_session_id
|
| 74 |
return new_session_id
|
| 75 |
|
| 76 |
+
# Jika sudah ada di session state, gunakan yang existing
|
| 77 |
existing_id = st.session_state.session_id
|
| 78 |
st.query_params["sid"] = existing_id
|
| 79 |
return existing_id
|
| 80 |
|
| 81 |
|
| 82 |
def get_session_cache_dir():
|
| 83 |
+
"""Dapatkan direktori cache khusus untuk session ini"""
|
| 84 |
sid = get_session_id()
|
| 85 |
cache_dir = Path(f"chache_file/sessions/{sid}")
|
| 86 |
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
def get_session_chunks_dir():
|
| 91 |
+
"""Dapatkan direktori chunks khusus untuk session ini"""
|
| 92 |
chunks_dir = get_session_cache_dir() / "chunks"
|
| 93 |
chunks_dir.mkdir(parents=True, exist_ok=True)
|
| 94 |
return chunks_dir
|
| 95 |
|
| 96 |
|
| 97 |
def cleanup_old_sessions():
|
| 98 |
+
"""Hapus cache session yang sudah expired (lebih dari 24 jam)"""
|
| 99 |
sessions_dir = Path("chache_file/sessions")
|
| 100 |
if not sessions_dir.exists():
|
| 101 |
return
|
|
|
|
| 106 |
mod_time = session_dir.stat().st_mtime
|
| 107 |
age_hours = (current_time - mod_time) / 3600
|
| 108 |
|
| 109 |
+
# Hapus jika sudah lebih dari CACHE_EXPIRY_HOURS
|
| 110 |
if age_hours > CACHE_EXPIRY_HOURS:
|
| 111 |
try:
|
| 112 |
shutil.rmtree(session_dir)
|
|
|
|
| 115 |
print(f"Error deleting session {session_dir.name}: {e}")
|
| 116 |
|
| 117 |
|
| 118 |
+
# Jalankan cleanup saat aplikasi dimulai
|
| 119 |
cleanup_old_sessions()
|
| 120 |
|
| 121 |
|
| 122 |
@st.cache_resource(show_spinner=False)
|
| 123 |
def get_model_resources():
|
| 124 |
+
"""Memuat model dan tokenizer IndoBERT (di-cache agar tidak reload terus)"""
|
| 125 |
return load_model_and_tokenizer()
|
| 126 |
|
| 127 |
|
| 128 |
+
# Load model dan tokenizer dengan spinner
|
| 129 |
with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
|
| 130 |
model, tokenizer, le, device = get_model_resources()
|
| 131 |
|
| 132 |
+
# Tampilkan notifikasi sukses sementara
|
| 133 |
success_placeholder = st.empty()
|
| 134 |
success_placeholder.success("Model dan tokenizer berhasil dimuat!")
|
| 135 |
time.sleep(1)
|
|
|
|
| 137 |
|
| 138 |
|
| 139 |
def convert_df_to_excel(df):
|
| 140 |
+
"""Mengubah DataFrame menjadi file Excel dalam bentuk byte stream untuk download"""
|
| 141 |
output = BytesIO()
|
| 142 |
with pd.ExcelWriter(output, engine="openpyxl") as writer:
|
| 143 |
df.to_excel(writer, index=False)
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
def clear_memory():
|
| 148 |
+
"""Bersihkan memory cache untuk optimasi performa"""
|
| 149 |
gc.collect()
|
| 150 |
if torch.cuda.is_available():
|
| 151 |
torch.cuda.empty_cache()
|
|
|
|
| 154 |
def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
|
| 155 |
"""
|
| 156 |
Memproses satu chunk data dengan batch processing.
|
| 157 |
+
Progress bar menunjukkan: Preprocessing 0-100%, lalu Predicting 0-100%
|
| 158 |
"""
|
| 159 |
+
# STEP 1: Preprocessing teks (0-100%)
|
| 160 |
cleaned_text_list = []
|
| 161 |
total_rows = len(chunk_dataframe)
|
| 162 |
|
|
|
|
| 164 |
clean_text = text_preprocessing_pipeline(str(raw_text))
|
| 165 |
cleaned_text_list.append(clean_text)
|
| 166 |
|
| 167 |
+
# Update progress bar setiap 50 baris
|
| 168 |
if idx % 50 == 0 or idx == total_rows - 1:
|
| 169 |
progress = (idx + 1) / total_rows
|
| 170 |
progress_bar.progress(progress)
|
|
|
|
| 177 |
f"Chunk {chunk_num}/{total_chunk_count} | Memulai prediksi...")
|
| 178 |
time.sleep(0.2)
|
| 179 |
|
| 180 |
+
# STEP 2: Batch Prediction dengan model (0-100%)
|
| 181 |
batch_sz = CONFIG.get("batch_size", 32)
|
| 182 |
num_sents = len(cleaned_text_list)
|
| 183 |
num_asps = len(ASPEK_COLUMNS)
|
| 184 |
|
| 185 |
+
# Siapkan dataset dan dataloader
|
| 186 |
ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
|
| 187 |
tokenizer, CONFIG["max_len"])
|
| 188 |
dl = DataLoader(
|
|
|
|
| 192 |
num_workers=0
|
| 193 |
)
|
| 194 |
|
| 195 |
+
# Matrix untuk menyimpan hasil prediksi
|
| 196 |
predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
|
| 197 |
|
| 198 |
batch_counter = 0
|
| 199 |
total_batch_count = len(dl)
|
| 200 |
|
| 201 |
+
# Proses prediksi batch demi batch
|
| 202 |
model.eval()
|
| 203 |
with torch.no_grad():
|
| 204 |
for batch_data in dl:
|
|
|
|
| 207 |
sent_idxs = batch_data['sent_idx'].numpy()
|
| 208 |
asp_idxs = batch_data['aspect_idx'].numpy()
|
| 209 |
|
| 210 |
+
# Prediksi dan konversi ke label
|
| 211 |
model_outputs = model(inp_ids, attn_mask)
|
| 212 |
probabilities = F.softmax(model_outputs, dim=1)
|
| 213 |
predicted_indices = torch.argmax(
|
| 214 |
probabilities, dim=1).cpu().numpy()
|
| 215 |
pred_labels = le.inverse_transform(predicted_indices)
|
| 216 |
|
| 217 |
+
# Simpan hasil prediksi ke matrix
|
| 218 |
for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
|
| 219 |
predictions_matrix[s_idx][a_idx] = lbl
|
| 220 |
|
| 221 |
+
# Update progress bar
|
| 222 |
batch_counter += 1
|
| 223 |
progress = batch_counter / total_batch_count
|
| 224 |
progress_bar.progress(progress)
|
| 225 |
status_text.text(
|
| 226 |
f"Chunk {chunk_num}/{total_chunk_count} | Predicting: {batch_counter}/{total_batch_count} batches")
|
| 227 |
|
| 228 |
+
# STEP 3: Gabungkan hasil prediksi dengan data asli
|
| 229 |
result_list = []
|
| 230 |
for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
|
| 231 |
row_dict = data_row.to_dict()
|
|
|
|
| 236 |
|
| 237 |
result_dataframe = pd.DataFrame(result_list)
|
| 238 |
|
| 239 |
+
# Simpan hasil chunk ke file CSV
|
| 240 |
chunks_directory = get_session_chunks_dir()
|
| 241 |
chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
|
| 242 |
result_dataframe.to_csv(chunk_filepath, index=False)
|
| 243 |
|
| 244 |
+
# Progress selesai
|
| 245 |
progress_bar.progress(1.0)
|
| 246 |
status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
|
| 247 |
|
|
|
|
| 251 |
|
| 252 |
|
| 253 |
def get_available_columns(df):
|
| 254 |
+
"""Deteksi kolom-kolom yang tersedia dalam dataframe untuk filter dan visualisasi dinamis"""
|
| 255 |
available = {
|
| 256 |
'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
|
| 257 |
'has_semester': 'semester' in df.columns,
|
|
|
|
| 272 |
st.markdown(" ")
|
| 273 |
st.markdown(" ")
|
| 274 |
|
| 275 |
+
# Panduan penggunaan aplikasi
|
| 276 |
steps = [
|
| 277 |
{"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
|
| 278 |
"description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
|
|
|
|
| 299 |
st.markdown("")
|
| 300 |
st.markdown("")
|
| 301 |
|
| 302 |
+
# Upload file Excel
|
| 303 |
uploaded_file = st.file_uploader(
|
| 304 |
" Upload Data Kritik & Saran",
|
| 305 |
type=["xlsx"],
|
| 306 |
help="File maksimal 200MB dengan format .xlsx"
|
| 307 |
)
|
| 308 |
|
| 309 |
+
# Tombol untuk hapus cache - KHUSUS PER SESSION
|
| 310 |
session_cache_dir = get_session_cache_dir()
|
| 311 |
session_result_file = session_cache_dir / "temp_predicted.csv"
|
| 312 |
session_chunks_dir = get_session_chunks_dir()
|
| 313 |
|
| 314 |
+
# Tombol hapus cache data hasil prediksi
|
| 315 |
if session_result_file.exists():
|
| 316 |
if st.button("Hapus Cache Data"):
|
| 317 |
session_result_file.unlink()
|
|
|
|
| 319 |
time.sleep(1)
|
| 320 |
st.rerun()
|
| 321 |
|
| 322 |
+
# Tombol hapus cache chunks
|
| 323 |
if session_chunks_dir.exists():
|
| 324 |
chunk_files = list(session_chunks_dir.glob("*.csv"))
|
| 325 |
if chunk_files:
|
|
|
|
| 331 |
time.sleep(1)
|
| 332 |
st.rerun()
|
| 333 |
|
| 334 |
+
# Tampilkan info file yang di-cache jika ada
|
| 335 |
if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
|
| 336 |
if not uploaded_file:
|
| 337 |
metadata_file = session_cache_dir / "metadata.txt"
|
|
|
|
| 353 |
st.caption(" ")
|
| 354 |
|
| 355 |
|
| 356 |
+
# Inisialisasi session state untuk menyimpan hasil prediksi
|
| 357 |
if "df_predicted" not in st.session_state:
|
| 358 |
st.session_state.df_predicted = None
|
| 359 |
|
| 360 |
+
# Load dari cache jika tersedia
|
| 361 |
if st.session_state.df_predicted is None and session_result_file.exists():
|
| 362 |
try:
|
| 363 |
df_cached = pd.read_csv(session_result_file)
|
|
|
|
| 370 |
st.warning(f"Gagal memuat cache: {e}")
|
| 371 |
|
| 372 |
|
| 373 |
+
# Proses file yang di-upload
|
| 374 |
if uploaded_file:
|
| 375 |
file_bytes = uploaded_file.getvalue()
|
| 376 |
+
# Cek apakah ini file baru atau file yang sama
|
| 377 |
if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
|
| 378 |
st.session_state.last_uploaded_file = file_bytes
|
| 379 |
st.session_state.uploaded_filename = uploaded_file.name
|
| 380 |
try:
|
| 381 |
df_uploaded = pd.read_excel(BytesIO(file_bytes))
|
| 382 |
|
| 383 |
+
# Konversi kolom tahun jika ada
|
| 384 |
if "tahun" in df_uploaded.columns:
|
| 385 |
df_uploaded["tahun"] = pd.to_numeric(
|
| 386 |
df_uploaded["tahun"], errors='coerce').astype('Int64')
|
|
|
|
| 388 |
except ValueError as err:
|
| 389 |
st.error(f"Gagal membaca file: {err}")
|
| 390 |
else:
|
| 391 |
+
# Validasi kolom kritik_saran wajib ada
|
| 392 |
if "kritik_saran" not in df_uploaded.columns:
|
| 393 |
st.error("Kolom 'kritik_saran' tidak ditemukan.")
|
| 394 |
else:
|
| 395 |
+
# Hapus duplikasi berdasarkan kolom kritik_saran
|
| 396 |
df_uploaded = df_uploaded.drop_duplicates(
|
| 397 |
subset=["kritik_saran"])
|
| 398 |
+
# Tambahkan kolom aspek jika belum ada
|
| 399 |
for aspect_col in ASPEK_COLUMNS:
|
| 400 |
if aspect_col not in df_uploaded.columns:
|
| 401 |
df_uploaded[aspect_col] = None
|
|
|
|
| 403 |
st.markdown("### Preprocessing dan Prediksi")
|
| 404 |
|
| 405 |
total_rows = len(df_uploaded)
|
| 406 |
+
# Tentukan apakah menggunakan chunked processing atau tidak
|
| 407 |
use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
|
| 408 |
|
| 409 |
if use_chunked:
|
| 410 |
+
# MODE CHUNKED PROCESSING untuk dataset besar
|
| 411 |
num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
|
| 412 |
|
| 413 |
info_col1, info_col2, info_col3 = st.columns(3)
|
|
|
|
| 426 |
chunk_status_text = st.empty()
|
| 427 |
overall_status = st.empty()
|
| 428 |
|
| 429 |
+
# Proses setiap chunk
|
| 430 |
for start_idx in range(0, total_rows, CHUNK_SIZE):
|
| 431 |
current_chunk_number = (start_idx // CHUNK_SIZE) + 1
|
| 432 |
current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
|
|
|
|
| 435 |
current_chunk_file = session_chunks_dir / \
|
| 436 |
f"chunk_{current_chunk_number}.csv"
|
| 437 |
|
| 438 |
+
# Cek apakah chunk sudah pernah diproses (ada di cache)
|
| 439 |
if current_chunk_file.exists():
|
| 440 |
chunk_result = pd.read_csv(current_chunk_file)
|
| 441 |
all_chunk_results.append(chunk_result)
|
|
|
|
| 454 |
time.sleep(0.3)
|
| 455 |
continue
|
| 456 |
|
| 457 |
+
# Proses chunk baru
|
| 458 |
chunk_progress_bar.progress(0)
|
| 459 |
|
| 460 |
chunk_result = process_chunk_batch(
|
|
|
|
| 463 |
)
|
| 464 |
all_chunk_results.append(chunk_result)
|
| 465 |
|
| 466 |
+
# Hitung estimasi waktu tersisa
|
| 467 |
processed = min(start_idx + CHUNK_SIZE, total_rows)
|
| 468 |
progress_pct = (processed / total_rows) * 100
|
| 469 |
elapsed = time.time() - start_time
|
|
|
|
| 478 |
|
| 479 |
time.sleep(0.3)
|
| 480 |
|
| 481 |
+
# Gabungkan semua hasil chunk
|
| 482 |
chunk_status_text.empty()
|
| 483 |
overall_status.info("🔄 Menggabungkan semua chunks...")
|
| 484 |
df_session = pd.concat(
|
|
|
|
| 489 |
duration = end_time - start_time
|
| 490 |
|
| 491 |
else:
|
| 492 |
+
# MODE BATCH PROCESSING untuk dataset kecil
|
| 493 |
st.info(
|
| 494 |
f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
|
| 495 |
|
|
|
|
| 498 |
progress_bar = st.progress(0)
|
| 499 |
status_text = st.empty()
|
| 500 |
|
| 501 |
+
# STEP 1: Preprocessing
|
| 502 |
cleaned_text_list = []
|
| 503 |
total_preprocessing = len(df_uploaded)
|
| 504 |
|
|
|
|
| 512 |
status_text.text(
|
| 513 |
f"Preprocessing: {idx+1}/{total_preprocessing} rows")
|
| 514 |
|
| 515 |
+
# STEP 2: Prediksi
|
| 516 |
progress_bar.progress(0)
|
| 517 |
status_text.text("Memulai prediksi...")
|
| 518 |
time.sleep(0.3)
|
|
|
|
| 556 |
status_text.text(
|
| 557 |
f"Predicting: {batch_counter}/{total_batch_count} batches")
|
| 558 |
|
| 559 |
+
# STEP 3: Gabungkan hasil
|
| 560 |
result_list = []
|
| 561 |
for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
|
| 562 |
row_dict = data_row.to_dict()
|
|
|
|
| 576 |
end_time = time.time()
|
| 577 |
duration = end_time - start_time
|
| 578 |
|
| 579 |
+
# Simpan hasil ke session state dan cache file
|
| 580 |
st.session_state.df_predicted = df_session
|
| 581 |
df_session.to_csv(session_result_file, index=False)
|
| 582 |
|
| 583 |
+
# Simpan metadata nama file
|
| 584 |
metadata_file = session_cache_dir / "metadata.txt"
|
| 585 |
with open(metadata_file, "w", encoding="utf-8") as f:
|
| 586 |
f.write(uploaded_file.name)
|
| 587 |
|
| 588 |
+
# Hitung performa processing
|
| 589 |
total_items = total_rows * len(ASPEK_COLUMNS)
|
| 590 |
items_per_second = total_items / duration if duration > 0 else 0
|
| 591 |
|
| 592 |
+
# Tampilkan ringkasan hasil processing
|
| 593 |
if use_chunked:
|
| 594 |
st.success(
|
| 595 |
f"✅ **Chunked + Batch Processing selesai!**\n\n"
|
|
|
|
| 609 |
f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
|
| 610 |
)
|
| 611 |
|
| 612 |
+
# Tampilan hasil prediksi dan visualisasi
|
| 613 |
if st.session_state.df_predicted is not None:
|
| 614 |
df_predicted = st.session_state.df_predicted
|
| 615 |
|
| 616 |
+
# Deteksi kolom yang tersedia untuk filter dinamis
|
| 617 |
available_cols = get_available_columns(df_predicted)
|
| 618 |
|
| 619 |
# Sidebar filter dengan pengecekan kolom dinamis
|
|
|
|
| 628 |
st.sidebar.info(
|
| 629 |
"Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
|
| 630 |
|
| 631 |
+
# Filter Mata Kuliah (jika kolom tersedia)
|
| 632 |
selected_matkul = []
|
| 633 |
if available_cols['has_matkul']:
|
| 634 |
matkul_options = sorted(
|
|
|
|
| 637 |
selected_matkul = st.sidebar.multiselect(
|
| 638 |
"Nama Mata Kuliah", matkul_options, default=matkul_options)
|
| 639 |
|
| 640 |
+
# Filter Program Studi (jika kolom tersedia)
|
| 641 |
selected_prodi = []
|
| 642 |
if available_cols['has_prodi']:
|
| 643 |
prodi_options = sorted(
|
|
|
|
| 646 |
selected_prodi = st.sidebar.multiselect(
|
| 647 |
"Program Studi", prodi_options, default=prodi_options)
|
| 648 |
|
| 649 |
+
# Filter Tahun (jika kolom tersedia)
|
| 650 |
selected_tahun = []
|
| 651 |
if available_cols['has_tahun']:
|
| 652 |
if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
|
|
|
|
| 659 |
selected_tahun = st.sidebar.multiselect(
|
| 660 |
"Tahun", tahun_options, default=tahun_options)
|
| 661 |
|
| 662 |
+
# Filter Semester (jika kolom tersedia)
|
| 663 |
selected_semester = []
|
| 664 |
if available_cols['has_semester']:
|
| 665 |
semester_options = sorted(
|
|
|
|
| 668 |
selected_semester = st.sidebar.multiselect(
|
| 669 |
"Semester", semester_options, default=semester_options)
|
| 670 |
|
| 671 |
+
# Terapkan semua filter yang dipilih
|
| 672 |
df_filtered = df_clean.copy()
|
| 673 |
|
| 674 |
if selected_matkul and available_cols['has_matkul']:
|
|
|
|
| 690 |
st.markdown("### Tabel Data Hasil Prediksi")
|
| 691 |
st.dataframe(df_filtered, width='stretch')
|
| 692 |
|
| 693 |
+
# Tombol download untuk data terfilter dan semua data
|
| 694 |
col_dl1, col_dl2 = st.columns(2)
|
| 695 |
with col_dl1:
|
| 696 |
st.download_button(
|
|
|
|
| 719 |
st.markdown("### Ringkasan Cepat")
|
| 720 |
st.markdown("")
|
| 721 |
|
| 722 |
+
# Hitung total sentimen dari semua aspek
|
| 723 |
total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
|
| 724 |
total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
|
| 725 |
total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
|
| 726 |
|
| 727 |
+
# Tentukan kolom ringkasan berdasarkan data yang tersedia
|
| 728 |
summary_cols = []
|
| 729 |
|
| 730 |
# Kolom dasar (selalu ada)
|
| 731 |
summary_cols.extend(['ulasan', 'aspek'])
|
| 732 |
|
| 733 |
+
# Kolom opsional berdasarkan ketersediaan data
|
| 734 |
if available_cols['has_matkul']:
|
| 735 |
summary_cols.append('matkul')
|
| 736 |
if available_cols['has_prodi']:
|
|
|
|
| 738 |
if available_cols['has_semester']:
|
| 739 |
summary_cols.append('semester')
|
| 740 |
|
| 741 |
+
# Buat kolom dinamis untuk menampilkan metrik
|
| 742 |
num_cols = len(summary_cols)
|
| 743 |
cols = st.columns(num_cols)
|
| 744 |
|
| 745 |
col_idx = 0
|
| 746 |
|
| 747 |
+
# Metrik dasar: Jumlah Ulasan & Aspek
|
| 748 |
cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
|
| 749 |
col_idx += 1
|
| 750 |
cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
|
| 751 |
col_idx += 1
|
| 752 |
|
| 753 |
+
# Metrik Mata Kuliah (jika tersedia)
|
| 754 |
if available_cols['has_matkul']:
|
| 755 |
matkul_count = df_filtered['nama_matakuliah'].nunique()
|
| 756 |
cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
|
| 757 |
col_idx += 1
|
| 758 |
|
| 759 |
+
# Metrik Prodi (jika tersedia)
|
| 760 |
if available_cols['has_prodi']:
|
| 761 |
prodi_count = df_filtered['nama_prodi'].nunique()
|
| 762 |
cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
|
| 763 |
col_idx += 1
|
| 764 |
|
| 765 |
+
# Metrik Semester (jika tersedia)
|
| 766 |
if available_cols['has_semester']:
|
| 767 |
semester_count = df_filtered['semester'].nunique()
|
| 768 |
cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
|
|
|
|
| 770 |
|
| 771 |
st.markdown("")
|
| 772 |
|
| 773 |
+
# Baris kedua: Metrik Sentimen dan info tambahan
|
| 774 |
summary_cols2 = ['positif', 'netral', 'negatif']
|
| 775 |
|
| 776 |
if available_cols['has_tahun']:
|
|
|
|
| 781 |
cols2 = st.columns(len(summary_cols2))
|
| 782 |
|
| 783 |
col_idx2 = 0
|
| 784 |
+
# Metrik untuk masing-masing jenis sentimen
|
| 785 |
cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
|
| 786 |
col_idx2 += 1
|
| 787 |
cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
|
|
|
|
| 789 |
cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
|
| 790 |
col_idx2 += 1
|
| 791 |
|
| 792 |
+
# Metrik Rentang Tahun (jika tersedia)
|
| 793 |
if available_cols['has_tahun']:
|
| 794 |
if 'tahun' in df_filtered.columns:
|
| 795 |
tahun_valid = df_filtered['tahun'].dropna()
|
|
|
|
| 807 |
cols2[col_idx2].metric("Rentang Tahun", "N/A")
|
| 808 |
col_idx2 += 1
|
| 809 |
|
| 810 |
+
# Metrik Rata-rata Panjang Kata (jika tersedia)
|
| 811 |
if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
|
| 812 |
try:
|
| 813 |
word_counts = df_filtered['kritik_saran'].astype(
|
|
|
|
| 828 |
with col2:
|
| 829 |
show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
|
| 830 |
|
| 831 |
+
# Visualisasi distribusi berdasarkan kolom yang tersedia
|
| 832 |
viz_shown = False
|
| 833 |
|
| 834 |
+
# Visualisasi Tahun dan Semester (jika tersedia)
|
| 835 |
if available_cols['has_tahun'] or available_cols['has_semester']:
|
| 836 |
col1, col2 = st.columns(2)
|
| 837 |
with col1:
|
|
|
|
| 845 |
if result:
|
| 846 |
viz_shown = True
|
| 847 |
|
| 848 |
+
# Visualisasi Program Studi (jika tersedia)
|
| 849 |
if available_cols['has_prodi']:
|
| 850 |
st.markdown("---")
|
| 851 |
result = show_prodi_distribution(df_filtered)
|
| 852 |
if result:
|
| 853 |
viz_shown = True
|
| 854 |
|
| 855 |
+
# Visualisasi Top 10 Mata Kuliah (jika tersedia)
|
| 856 |
if available_cols['has_matkul']:
|
| 857 |
st.markdown("---")
|
| 858 |
result = show_top10_matkul_distribution(df_filtered)
|
| 859 |
if result:
|
| 860 |
viz_shown = True
|
| 861 |
|
| 862 |
+
# Visualisasi Sentimen per Tahun/Semester (jika tersedia)
|
| 863 |
if available_cols['has_tahun'] or available_cols['has_semester']:
|
| 864 |
st.markdown("---")
|
| 865 |
col1, col2 = st.columns(2)
|
|
|
|
| 874 |
if result:
|
| 875 |
viz_shown = True
|
| 876 |
|
| 877 |
+
# Visualisasi Sentimen per Program Studi (jika tersedia)
|
| 878 |
if available_cols['has_prodi']:
|
| 879 |
st.markdown("---")
|
| 880 |
result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
|
| 881 |
if result:
|
| 882 |
viz_shown = True
|
| 883 |
|
| 884 |
+
# Visualisasi Sentimen per Top 10 Mata Kuliah (jika tersedia)
|
| 885 |
if available_cols['has_matkul']:
|
| 886 |
st.markdown("---")
|
| 887 |
result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
|
| 888 |
if result:
|
| 889 |
viz_shown = True
|
| 890 |
|
| 891 |
+
# Footer aplikasi
|
| 892 |
st.caption("""
|
| 893 |
<div class='footer'>
|
| 894 |
© 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
|
visualization.py
CHANGED
|
@@ -13,12 +13,13 @@ import plotly.express as px
|
|
| 13 |
from config import ASPEK_COLUMNS
|
| 14 |
|
| 15 |
|
| 16 |
-
#
|
| 17 |
sentimen_palette = {
|
| 18 |
"netral": "#FFE24C",
|
| 19 |
"positif": "#4CFF72",
|
| 20 |
"negatif": "#FF4C4C"
|
| 21 |
}
|
|
|
|
| 22 |
category_order = ["netral", "positif", "negatif"]
|
| 23 |
|
| 24 |
# Konfigurasi Plotly
|
|
@@ -30,20 +31,24 @@ config_options = {
|
|
| 30 |
|
| 31 |
def show_sentiment_bar_chart(df_predicted, aspek_columns):
|
| 32 |
"""Menampilkan bar chart distribusi sentimen per aspek."""
|
|
|
|
| 33 |
if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
|
| 34 |
st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
|
| 35 |
return
|
| 36 |
|
|
|
|
| 37 |
df_long = df_predicted.melt(
|
| 38 |
value_vars=aspek_columns,
|
| 39 |
var_name="aspek",
|
| 40 |
value_name="sentimen"
|
| 41 |
)
|
|
|
|
| 42 |
df_long["sentimen"] = pd.Categorical(
|
| 43 |
df_long["sentimen"],
|
| 44 |
categories=category_order,
|
| 45 |
ordered=True
|
| 46 |
)
|
|
|
|
| 47 |
count_data = df_long.groupby(
|
| 48 |
["aspek", "sentimen"], observed=False
|
| 49 |
).size().reset_index(name="jumlah")
|
|
@@ -62,10 +67,12 @@ def show_sentiment_bar_chart(df_predicted, aspek_columns):
|
|
| 62 |
|
| 63 |
def show_sentiment_pie_chart(df_predicted, aspek_columns):
|
| 64 |
"""Menampilkan pie chart distribusi total sentimen."""
|
|
|
|
| 65 |
sentimen_total = df_predicted[aspek_columns].values.ravel()
|
| 66 |
sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
|
| 67 |
sentimen_counts.columns = ["sentimen", "jumlah"]
|
| 68 |
sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
|
|
|
|
| 69 |
fig = px.pie(sentimen_counts, names="sentimen", values="jumlah",
|
| 70 |
color="sentimen", color_discrete_map=sentimen_palette,
|
| 71 |
hole=0.3)
|
|
@@ -76,12 +83,13 @@ def show_sentiment_pie_chart(df_predicted, aspek_columns):
|
|
| 76 |
|
| 77 |
def show_year_distribution(df):
|
| 78 |
"""Menampilkan distribusi jumlah kritik/saran per tahun."""
|
| 79 |
-
#
|
| 80 |
if 'tanggal' in df.columns and 'tahun' not in df.columns:
|
| 81 |
df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
|
| 82 |
|
|
|
|
| 83 |
if 'tahun' not in df.columns:
|
| 84 |
-
return None
|
| 85 |
|
| 86 |
df_tahun = df.dropna(subset=['tahun']).copy()
|
| 87 |
if df_tahun.empty:
|
|
@@ -121,6 +129,7 @@ def show_prodi_distribution(df):
|
|
| 121 |
|
| 122 |
prodi_counts = df['nama_prodi'].value_counts().reset_index()
|
| 123 |
prodi_counts.columns = ['nama_prodi', 'jumlah']
|
|
|
|
| 124 |
prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
|
| 125 |
fig = px.bar(
|
| 126 |
prodi_counts,
|
|
@@ -142,6 +151,7 @@ def show_top10_matkul_distribution(df):
|
|
| 142 |
if missing_cols:
|
| 143 |
return None
|
| 144 |
|
|
|
|
| 145 |
matkul_counts = (
|
| 146 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 147 |
.size()
|
|
@@ -149,6 +159,7 @@ def show_top10_matkul_distribution(df):
|
|
| 149 |
.sort_values(by='jumlah', ascending=False)
|
| 150 |
.head(10)
|
| 151 |
)
|
|
|
|
| 152 |
matkul_counts['label'] = (
|
| 153 |
matkul_counts['kode_matakuliah'] + " - " +
|
| 154 |
matkul_counts['nama_matakuliah']
|
|
@@ -169,13 +180,14 @@ def show_top10_matkul_distribution(df):
|
|
| 169 |
|
| 170 |
def show_sentiment_by_year(df, aspek_columns):
|
| 171 |
"""Menampilkan distribusi sentimen per tahun."""
|
| 172 |
-
#
|
| 173 |
if 'tanggal' in df.columns and 'tahun' not in df.columns:
|
| 174 |
df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
|
| 175 |
|
| 176 |
if 'tahun' not in df.columns:
|
| 177 |
return None
|
| 178 |
|
|
|
|
| 179 |
df_long = df.melt(id_vars=['tahun'],
|
| 180 |
value_vars=aspek_columns,
|
| 181 |
var_name='aspek',
|
|
@@ -230,13 +242,16 @@ def show_sentiment_by_prodi(df, aspek_columns):
|
|
| 230 |
.reset_index(name='jumlah')
|
| 231 |
)
|
| 232 |
|
|
|
|
| 233 |
total_per_prodi = (
|
| 234 |
prodi_sentiment.groupby('nama_prodi')['jumlah']
|
| 235 |
.sum()
|
| 236 |
.sort_values(ascending=False)
|
| 237 |
)
|
|
|
|
| 238 |
ordered_categories = total_per_prodi.index.tolist()[::-1]
|
| 239 |
|
|
|
|
| 240 |
prodi_sentiment['nama_prodi'] = pd.Categorical(
|
| 241 |
prodi_sentiment['nama_prodi'],
|
| 242 |
categories=ordered_categories,
|
|
@@ -269,6 +284,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
|
|
| 269 |
if missing_cols:
|
| 270 |
return None
|
| 271 |
|
|
|
|
| 272 |
df_top10 = (
|
| 273 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 274 |
.size()
|
|
@@ -287,6 +303,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
|
|
| 287 |
value_name='sentimen'
|
| 288 |
)
|
| 289 |
|
|
|
|
| 290 |
df_long['label'] = (
|
| 291 |
df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
|
| 292 |
)
|
|
@@ -297,6 +314,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
|
|
| 297 |
.reset_index(name='jumlah')
|
| 298 |
)
|
| 299 |
|
|
|
|
| 300 |
total_per_label = (
|
| 301 |
matkul_sentiment.groupby('label')['jumlah']
|
| 302 |
.sum()
|
|
|
|
| 13 |
from config import ASPEK_COLUMNS
|
| 14 |
|
| 15 |
|
| 16 |
+
# Definisi warna untuk setiap kategori sentimen
|
| 17 |
sentimen_palette = {
|
| 18 |
"netral": "#FFE24C",
|
| 19 |
"positif": "#4CFF72",
|
| 20 |
"negatif": "#FF4C4C"
|
| 21 |
}
|
| 22 |
+
# Urutan kategori untuk konsistensi tampilan di semua chart
|
| 23 |
category_order = ["netral", "positif", "negatif"]
|
| 24 |
|
| 25 |
# Konfigurasi Plotly
|
|
|
|
| 31 |
|
| 32 |
def show_sentiment_bar_chart(df_predicted, aspek_columns):
|
| 33 |
"""Menampilkan bar chart distribusi sentimen per aspek."""
|
| 34 |
+
# Validasi data dan kolom yang diperlukan
|
| 35 |
if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
|
| 36 |
st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
|
| 37 |
return
|
| 38 |
|
| 39 |
+
# Transformasi dari wide ke long format untuk visualisasi
|
| 40 |
df_long = df_predicted.melt(
|
| 41 |
value_vars=aspek_columns,
|
| 42 |
var_name="aspek",
|
| 43 |
value_name="sentimen"
|
| 44 |
)
|
| 45 |
+
# Konversi ke categorical untuk memastikan urutan yang konsisten
|
| 46 |
df_long["sentimen"] = pd.Categorical(
|
| 47 |
df_long["sentimen"],
|
| 48 |
categories=category_order,
|
| 49 |
ordered=True
|
| 50 |
)
|
| 51 |
+
# Agregasi data untuk menghitung jumlah per aspek dan sentimen
|
| 52 |
count_data = df_long.groupby(
|
| 53 |
["aspek", "sentimen"], observed=False
|
| 54 |
).size().reset_index(name="jumlah")
|
|
|
|
| 67 |
|
| 68 |
def show_sentiment_pie_chart(df_predicted, aspek_columns):
|
| 69 |
"""Menampilkan pie chart distribusi total sentimen."""
|
| 70 |
+
# Flatten semua nilai sentimen dari semua aspek menjadi 1D array
|
| 71 |
sentimen_total = df_predicted[aspek_columns].values.ravel()
|
| 72 |
sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
|
| 73 |
sentimen_counts.columns = ["sentimen", "jumlah"]
|
| 74 |
sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
|
| 75 |
+
# Donut chart dengan hole parameter
|
| 76 |
fig = px.pie(sentimen_counts, names="sentimen", values="jumlah",
|
| 77 |
color="sentimen", color_discrete_map=sentimen_palette,
|
| 78 |
hole=0.3)
|
|
|
|
| 83 |
|
| 84 |
def show_year_distribution(df):
|
| 85 |
"""Menampilkan distribusi jumlah kritik/saran per tahun."""
|
| 86 |
+
# Ekstraksi tahun dari kolom tanggal jika kolom tahun tidak tersedia
|
| 87 |
if 'tanggal' in df.columns and 'tahun' not in df.columns:
|
| 88 |
df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
|
| 89 |
|
| 90 |
+
# Return None jika tidak ada data tahun (untuk handling di pemanggil)
|
| 91 |
if 'tahun' not in df.columns:
|
| 92 |
+
return None
|
| 93 |
|
| 94 |
df_tahun = df.dropna(subset=['tahun']).copy()
|
| 95 |
if df_tahun.empty:
|
|
|
|
| 129 |
|
| 130 |
prodi_counts = df['nama_prodi'].value_counts().reset_index()
|
| 131 |
prodi_counts.columns = ['nama_prodi', 'jumlah']
|
| 132 |
+
# Sort ascending untuk horizontal bar (nilai kecil di bawah)
|
| 133 |
prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
|
| 134 |
fig = px.bar(
|
| 135 |
prodi_counts,
|
|
|
|
| 151 |
if missing_cols:
|
| 152 |
return None
|
| 153 |
|
| 154 |
+
# Groupby untuk menghitung frekuensi per mata kuliah
|
| 155 |
matkul_counts = (
|
| 156 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 157 |
.size()
|
|
|
|
| 159 |
.sort_values(by='jumlah', ascending=False)
|
| 160 |
.head(10)
|
| 161 |
)
|
| 162 |
+
# Gabungkan kode dan nama untuk label yang informatif
|
| 163 |
matkul_counts['label'] = (
|
| 164 |
matkul_counts['kode_matakuliah'] + " - " +
|
| 165 |
matkul_counts['nama_matakuliah']
|
|
|
|
| 180 |
|
| 181 |
def show_sentiment_by_year(df, aspek_columns):
|
| 182 |
"""Menampilkan distribusi sentimen per tahun."""
|
| 183 |
+
# Ekstraksi tahun dari kolom tanggal jika diperlukan
|
| 184 |
if 'tanggal' in df.columns and 'tahun' not in df.columns:
|
| 185 |
df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
|
| 186 |
|
| 187 |
if 'tahun' not in df.columns:
|
| 188 |
return None
|
| 189 |
|
| 190 |
+
# Transformasi ke long format dengan id_vars tahun
|
| 191 |
df_long = df.melt(id_vars=['tahun'],
|
| 192 |
value_vars=aspek_columns,
|
| 193 |
var_name='aspek',
|
|
|
|
| 242 |
.reset_index(name='jumlah')
|
| 243 |
)
|
| 244 |
|
| 245 |
+
# Hitung total per prodi untuk mengurutkan dari terbanyak ke sedikit
|
| 246 |
total_per_prodi = (
|
| 247 |
prodi_sentiment.groupby('nama_prodi')['jumlah']
|
| 248 |
.sum()
|
| 249 |
.sort_values(ascending=False)
|
| 250 |
)
|
| 251 |
+
# Reverse order untuk horizontal bar (nilai besar di atas)
|
| 252 |
ordered_categories = total_per_prodi.index.tolist()[::-1]
|
| 253 |
|
| 254 |
+
# Konversi ke categorical untuk kontrol urutan tampilan
|
| 255 |
prodi_sentiment['nama_prodi'] = pd.Categorical(
|
| 256 |
prodi_sentiment['nama_prodi'],
|
| 257 |
categories=ordered_categories,
|
|
|
|
| 284 |
if missing_cols:
|
| 285 |
return None
|
| 286 |
|
| 287 |
+
# Filter top 10 mata kuliah berdasarkan frekuensi
|
| 288 |
df_top10 = (
|
| 289 |
df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
|
| 290 |
.size()
|
|
|
|
| 303 |
value_name='sentimen'
|
| 304 |
)
|
| 305 |
|
| 306 |
+
# Gabungkan kode dan nama untuk label
|
| 307 |
df_long['label'] = (
|
| 308 |
df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
|
| 309 |
)
|
|
|
|
| 314 |
.reset_index(name='jumlah')
|
| 315 |
)
|
| 316 |
|
| 317 |
+
# Urutkan berdasarkan total sentimen per mata kuliah
|
| 318 |
total_per_label = (
|
| 319 |
matkul_sentiment.groupby('label')['jumlah']
|
| 320 |
.sum()
|