zdannn2808 commited on
Commit
4ca46c3
·
verified ·
1 Parent(s): 9e73d6d

perbaiki model_utils.py and back app.py, visualization.py

Browse files
Files changed (3) hide show
  1. app.py +197 -288
  2. model_utils.py +85 -2
  3. visualization.py +86 -254
app.py CHANGED
@@ -6,8 +6,6 @@ berbasis aspek dari kritik dan saran mahasiswa.
6
  UPDATED: Dengan Batch + Chunked Processing + Session-based Cache untuk multi-user
7
  UPDATED: Visualisasi dinamis yang menyesuaikan dengan kolom yang tersedia
8
  """
9
-
10
- # Import library yang diperlukan
11
  import os
12
  import time
13
  import gc
@@ -38,48 +36,41 @@ from visualization import (
38
  from preprocessing import text_preprocessing_pipeline
39
 
40
  # Konfigurasi untuk chunked processing
41
- CHUNK_SIZE = 2500 # Ukuran chunk untuk memproses data besar
42
- ENABLE_CHUNKED = True # Aktifkan mode chunked processing
43
- CACHE_EXPIRY_HOURS = 24 # Durasi cache sebelum dihapus otomatis
44
 
45
- # Membuat direktori cache jika belum ada
46
  os.makedirs("chache_file", exist_ok=True)
47
  os.makedirs("chache_file/sessions", exist_ok=True)
48
 
49
- # Konfigurasi halaman Streamlit
50
  st.set_page_config(
51
  page_title="ABSA IndoBERT",
52
  layout="wide",
53
  page_icon="💬"
54
  )
55
 
56
- # Load custom CSS untuk styling
57
  with open(os.path.join("assets", "style.css"), encoding="utf-8") as f:
58
  st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
59
  st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css" rel="stylesheet">', unsafe_allow_html=True)
60
 
61
 
62
  def get_session_id():
63
- """
64
- Generate atau retrieve session ID untuk user - PERSISTENT across refresh
65
- Menggunakan query params agar session tetap konsisten saat refresh
66
- """
67
  query_params = st.query_params
68
 
69
- # Cek jika sudah ada session ID di URL
70
  if "sid" in query_params:
71
  sid = query_params["sid"]
72
  st.session_state.session_id = sid
73
  return sid
74
 
75
- # Buat session ID baru jika belum ada
76
  if "session_id" not in st.session_state:
77
  new_session_id = str(uuid.uuid4())
78
  st.session_state.session_id = new_session_id
79
  st.query_params["sid"] = new_session_id
80
  return new_session_id
81
 
82
- # Gunakan session ID yang sudah ada
83
  existing_id = st.session_state.session_id
84
  st.query_params["sid"] = existing_id
85
  return existing_id
@@ -101,10 +92,7 @@ def get_session_chunks_dir():
101
 
102
 
103
  def cleanup_old_sessions():
104
- """
105
- Hapus session cache yang sudah expired (> 24 jam)
106
- Membersihkan cache lama untuk menghemat storage
107
- """
108
  sessions_dir = Path("chache_file/sessions")
109
  if not sessions_dir.exists():
110
  return
@@ -115,7 +103,6 @@ def cleanup_old_sessions():
115
  mod_time = session_dir.stat().st_mtime
116
  age_hours = (current_time - mod_time) / 3600
117
 
118
- # Hapus jika lebih dari 24 jam
119
  if age_hours > CACHE_EXPIRY_HOURS:
120
  try:
121
  shutil.rmtree(session_dir)
@@ -124,24 +111,18 @@ def cleanup_old_sessions():
124
  print(f"Error deleting session {session_dir.name}: {e}")
125
 
126
 
127
- # Jalankan cleanup saat aplikasi dimulai
128
  cleanup_old_sessions()
129
 
130
 
131
  @st.cache_resource(show_spinner=False)
132
  def get_model_resources():
133
- """
134
- Memuat model dan tokenizer IndoBERT
135
- Menggunakan cache agar model tidak dimuat ulang setiap kali
136
- """
137
  return load_model_and_tokenizer()
138
 
139
 
140
- # Load model dengan spinner
141
  with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
142
  model, tokenizer, le, device = get_model_resources()
143
 
144
- # Tampilkan notifikasi sukses sementara
145
  success_placeholder = st.empty()
146
  success_placeholder.success("Model dan tokenizer berhasil dimuat!")
147
  time.sleep(1)
@@ -149,7 +130,7 @@ success_placeholder.empty()
149
 
150
 
151
  def convert_df_to_excel(df):
152
- """Mengubah DataFrame menjadi file Excel dalam bentuk byte stream untuk download"""
153
  output = BytesIO()
154
  with pd.ExcelWriter(output, engine="openpyxl") as writer:
155
  df.to_excel(writer, index=False)
@@ -157,7 +138,7 @@ def convert_df_to_excel(df):
157
 
158
 
159
  def clear_memory():
160
- """Clear memory cache untuk menghemat RAM dan VRAM"""
161
  gc.collect()
162
  if torch.cuda.is_available():
163
  torch.cuda.empty_cache()
@@ -165,20 +146,8 @@ def clear_memory():
165
 
166
  def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
167
  """
168
- Memproses satu chunk data dengan batch processing
169
- STEP 1: Preprocessing teks (cleaning, normalisasi)
170
- STEP 2: Batch Prediction menggunakan model IndoBERT
171
- STEP 3: Combine results dan simpan ke file CSV
172
-
173
- Args:
174
- chunk_dataframe: Data chunk yang akan diproses
175
- chunk_num: Nomor chunk saat ini
176
- total_chunk_count: Total jumlah chunk
177
- progress_bar: Progress bar Streamlit
178
- status_text: Text status Streamlit
179
-
180
- Returns:
181
- result_dataframe: DataFrame hasil prediksi untuk chunk ini
182
  """
183
  # STEP 1: Preprocessing (0-100%)
184
  cleaned_text_list = []
@@ -188,7 +157,6 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
188
  clean_text = text_preprocessing_pipeline(str(raw_text))
189
  cleaned_text_list.append(clean_text)
190
 
191
- # Update progress bar setiap 50 baris
192
  if idx % 50 == 0 or idx == total_rows - 1:
193
  progress = (idx + 1) / total_rows
194
  progress_bar.progress(progress)
@@ -206,7 +174,6 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
206
  num_sents = len(cleaned_text_list)
207
  num_asps = len(ASPEK_COLUMNS)
208
 
209
- # Buat dataset dan dataloader
210
  ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
211
  tokenizer, CONFIG["max_len"])
212
  dl = DataLoader(
@@ -216,13 +183,11 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
216
  num_workers=0
217
  )
218
 
219
- # Matrix untuk menyimpan hasil prediksi
220
  predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
221
 
222
  batch_counter = 0
223
  total_batch_count = len(dl)
224
 
225
- # Lakukan prediksi batch demi batch
226
  model.eval()
227
  with torch.no_grad():
228
  for batch_data in dl:
@@ -231,18 +196,15 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
231
  sent_idxs = batch_data['sent_idx'].numpy()
232
  asp_idxs = batch_data['aspect_idx'].numpy()
233
 
234
- # Forward pass model
235
  model_outputs = model(inp_ids, attn_mask)
236
  probabilities = F.softmax(model_outputs, dim=1)
237
  predicted_indices = torch.argmax(
238
  probabilities, dim=1).cpu().numpy()
239
  pred_labels = le.inverse_transform(predicted_indices)
240
 
241
- # Simpan hasil prediksi ke matrix
242
  for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
243
  predictions_matrix[s_idx][a_idx] = lbl
244
 
245
- # Update progress bar
246
  batch_counter += 1
247
  progress = batch_counter / total_batch_count
248
  progress_bar.progress(progress)
@@ -254,14 +216,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
254
  for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
255
  row_dict = data_row.to_dict()
256
  row_dict["kritik_saran"] = cleaned_text_list[idx]
257
- # Tambahkan hasil prediksi untuk setiap aspek
258
  for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
259
  row_dict[asp_name] = predictions_matrix[idx][asp_idx]
260
  result_list.append(row_dict)
261
 
262
  result_dataframe = pd.DataFrame(result_list)
263
 
264
- # Simpan chunk ke file CSV
265
  chunks_directory = get_session_chunks_dir()
266
  chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
267
  result_dataframe.to_csv(chunk_filepath, index=False)
@@ -270,17 +230,13 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
270
  progress_bar.progress(1.0)
271
  status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
272
 
273
- # Bersihkan memory
274
  clear_memory()
275
 
276
  return result_dataframe
277
 
278
 
279
  def get_available_columns(df):
280
- """
281
- Deteksi kolom-kolom yang tersedia dalam dataframe
282
- Untuk menentukan visualisasi mana yang bisa ditampilkan
283
- """
284
  available = {
285
  'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
286
  'has_semester': 'semester' in df.columns,
@@ -290,8 +246,6 @@ def get_available_columns(df):
290
  return available
291
 
292
 
293
- # ================== BAGIAN UI UTAMA ==================
294
-
295
  # Judul aplikasi
296
  st.markdown("""
297
  <h1 class='title-center'>ABSA IndoBERT</h1>
@@ -303,7 +257,7 @@ st.markdown(" ")
303
  st.markdown(" ")
304
  st.markdown(" ")
305
 
306
- # Panduan penggunaan aplikasi
307
  steps = [
308
  {"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
309
  "description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
@@ -315,7 +269,6 @@ steps = [
315
  "description": "Unduh hasil analisis lengkap Anda dalam format file Excel untuk laporan lebih lanjut."}
316
  ]
317
 
318
- # Tampilkan panduan dalam 4 kolom
319
  cols = st.columns(len(steps))
320
 
321
  for i, step in enumerate(steps):
@@ -331,19 +284,18 @@ for i, step in enumerate(steps):
331
  st.markdown("")
332
  st.markdown("")
333
 
334
- # Upload file Excel
335
  uploaded_file = st.file_uploader(
336
  " Upload Data Kritik & Saran",
337
  type=["xlsx"],
338
  help="File maksimal 200MB dengan format .xlsx"
339
  )
340
 
341
- # Tombol untuk menghapus cache (session-specific)
342
  session_cache_dir = get_session_cache_dir()
343
  session_result_file = session_cache_dir / "temp_predicted.csv"
344
  session_chunks_dir = get_session_chunks_dir()
345
 
346
- # Tombol hapus cache data utama
347
  if session_result_file.exists():
348
  if st.button("Hapus Cache Data"):
349
  session_result_file.unlink()
@@ -351,7 +303,6 @@ if session_result_file.exists():
351
  time.sleep(1)
352
  st.rerun()
353
 
354
- # Tombol hapus cache chunks
355
  if session_chunks_dir.exists():
356
  chunk_files = list(session_chunks_dir.glob("*.csv"))
357
  if chunk_files:
@@ -363,7 +314,6 @@ if session_chunks_dir.exists():
363
  time.sleep(1)
364
  st.rerun()
365
 
366
- # Tampilkan info file yang di-cache
367
  if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
368
  if not uploaded_file:
369
  metadata_file = session_cache_dir / "metadata.txt"
@@ -384,15 +334,13 @@ if session_result_file.exists() or (session_chunks_dir.exists() and list(session
384
  else:
385
  st.caption(" ")
386
 
387
- # Inisialisasi session state untuk hasil prediksi
388
  if "df_predicted" not in st.session_state:
389
  st.session_state.df_predicted = None
390
 
391
- # Load cache jika ada
392
  if st.session_state.df_predicted is None and session_result_file.exists():
393
  try:
394
  df_cached = pd.read_csv(session_result_file)
395
- # Konversi kolom tahun ke format yang benar
396
  if "tahun" in df_cached.columns:
397
  df_cached["tahun"] = pd.to_numeric(
398
  df_cached["tahun"], errors='coerce').astype('Int64')
@@ -402,20 +350,14 @@ if st.session_state.df_predicted is None and session_result_file.exists():
402
  st.warning(f"Gagal memuat cache: {e}")
403
 
404
 
405
- # ================== PROSES UPLOAD & PREDIKSI ==================
406
  if uploaded_file:
407
  file_bytes = uploaded_file.getvalue()
408
-
409
- # Cek apakah file baru atau sama dengan sebelumnya
410
  if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
411
  st.session_state.last_uploaded_file = file_bytes
412
  st.session_state.uploaded_filename = uploaded_file.name
413
-
414
  try:
415
- # Baca file Excel
416
  df_uploaded = pd.read_excel(BytesIO(file_bytes))
417
 
418
- # Konversi kolom tahun jika ada
419
  if "tahun" in df_uploaded.columns:
420
  df_uploaded["tahun"] = pd.to_numeric(
421
  df_uploaded["tahun"], errors='coerce').astype('Int64')
@@ -423,15 +365,11 @@ if uploaded_file:
423
  except ValueError as err:
424
  st.error(f"Gagal membaca file: {err}")
425
  else:
426
- # Validasi kolom wajib
427
  if "kritik_saran" not in df_uploaded.columns:
428
  st.error("Kolom 'kritik_saran' tidak ditemukan.")
429
  else:
430
- # Hapus duplikat berdasarkan kolom kritik_saran
431
  df_uploaded = df_uploaded.drop_duplicates(
432
  subset=["kritik_saran"])
433
-
434
- # Tambahkan kolom aspek jika belum ada
435
  for aspect_col in ASPEK_COLUMNS:
436
  if aspect_col not in df_uploaded.columns:
437
  df_uploaded[aspect_col] = None
@@ -441,11 +379,9 @@ if uploaded_file:
441
  total_rows = len(df_uploaded)
442
  use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
443
 
444
- # ============ MODE CHUNKED PROCESSING ============
445
  if use_chunked:
446
  num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
447
 
448
- # Tampilkan info processing
449
  info_col1, info_col2, info_col3 = st.columns(3)
450
  with info_col1:
451
  st.info(f"**Total data:** {total_rows:,} rows")
@@ -462,7 +398,6 @@ if uploaded_file:
462
  chunk_status_text = st.empty()
463
  overall_status = st.empty()
464
 
465
- # Proses setiap chunk
466
  for start_idx in range(0, total_rows, CHUNK_SIZE):
467
  current_chunk_number = (start_idx // CHUNK_SIZE) + 1
468
  current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
@@ -471,7 +406,6 @@ if uploaded_file:
471
  current_chunk_file = session_chunks_dir / \
472
  f"chunk_{current_chunk_number}.csv"
473
 
474
- # Cek apakah chunk sudah pernah diproses (ada di cache)
475
  if current_chunk_file.exists():
476
  chunk_result = pd.read_csv(current_chunk_file)
477
  all_chunk_results.append(chunk_result)
@@ -490,7 +424,6 @@ if uploaded_file:
490
  time.sleep(0.3)
491
  continue
492
 
493
- # Proses chunk baru
494
  chunk_progress_bar.progress(0)
495
 
496
  chunk_result = process_chunk_batch(
@@ -499,7 +432,6 @@ if uploaded_file:
499
  )
500
  all_chunk_results.append(chunk_result)
501
 
502
- # Hitung estimasi waktu
503
  processed = min(start_idx + CHUNK_SIZE, total_rows)
504
  progress_pct = (processed / total_rows) * 100
505
  elapsed = time.time() - start_time
@@ -514,7 +446,6 @@ if uploaded_file:
514
 
515
  time.sleep(0.3)
516
 
517
- # Gabungkan semua chunk
518
  chunk_status_text.empty()
519
  overall_status.info("🔄 Menggabungkan semua chunks...")
520
  df_session = pd.concat(
@@ -524,7 +455,6 @@ if uploaded_file:
524
  end_time = time.time()
525
  duration = end_time - start_time
526
 
527
- # ============ MODE BATCH PROCESSING (tanpa chunk) ============
528
  else:
529
  st.info(
530
  f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
@@ -534,7 +464,6 @@ if uploaded_file:
534
  progress_bar = st.progress(0)
535
  status_text = st.empty()
536
 
537
- # Preprocessing
538
  cleaned_text_list = []
539
  total_preprocessing = len(df_uploaded)
540
 
@@ -552,7 +481,6 @@ if uploaded_file:
552
  status_text.text("Memulai prediksi...")
553
  time.sleep(0.3)
554
 
555
- # Batch Prediction
556
  batch_sz = CONFIG.get("batch_size", 32)
557
  num_sents = len(cleaned_text_list)
558
  num_asps = len(ASPEK_COLUMNS)
@@ -592,7 +520,6 @@ if uploaded_file:
592
  status_text.text(
593
  f"Predicting: {batch_counter}/{total_batch_count} batches")
594
 
595
- # Combine results
596
  result_list = []
597
  for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
598
  row_dict = data_row.to_dict()
@@ -612,20 +539,16 @@ if uploaded_file:
612
  end_time = time.time()
613
  duration = end_time - start_time
614
 
615
- # Simpan hasil ke session state dan cache
616
  st.session_state.df_predicted = df_session
617
  df_session.to_csv(session_result_file, index=False)
618
 
619
- # Simpan metadata file
620
  metadata_file = session_cache_dir / "metadata.txt"
621
  with open(metadata_file, "w", encoding="utf-8") as f:
622
  f.write(uploaded_file.name)
623
 
624
- # Hitung statistik processing
625
  total_items = total_rows * len(ASPEK_COLUMNS)
626
  items_per_second = total_items / duration if duration > 0 else 0
627
 
628
- # Tampilkan hasil processing
629
  if use_chunked:
630
  st.success(
631
  f"✅ **Chunked + Batch Processing selesai!**\n\n"
@@ -645,14 +568,14 @@ if uploaded_file:
645
  f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
646
  )
647
 
648
- # ================== TAMPILAN HASIL & VISUALISASI ==================
649
  if st.session_state.df_predicted is not None:
650
  df_predicted = st.session_state.df_predicted
651
 
652
- # Deteksi kolom yang tersedia dalam dataframe
653
  available_cols = get_available_columns(df_predicted)
654
 
655
- # ============ SIDEBAR FILTER ============
656
  st.sidebar.header("Filter Data")
657
 
658
  df_clean = df_predicted.copy()
@@ -664,7 +587,7 @@ if st.session_state.df_predicted is not None:
664
  st.sidebar.info(
665
  "Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
666
 
667
- # Filter Mata Kuliah (jika ada)
668
  selected_matkul = []
669
  if available_cols['has_matkul']:
670
  matkul_options = sorted(
@@ -673,7 +596,7 @@ if st.session_state.df_predicted is not None:
673
  selected_matkul = st.sidebar.multiselect(
674
  "Nama Mata Kuliah", matkul_options, default=matkul_options)
675
 
676
- # Filter Program Studi (jika ada)
677
  selected_prodi = []
678
  if available_cols['has_prodi']:
679
  prodi_options = sorted(
@@ -682,10 +605,9 @@ if st.session_state.df_predicted is not None:
682
  selected_prodi = st.sidebar.multiselect(
683
  "Program Studi", prodi_options, default=prodi_options)
684
 
685
- # Filter Tahun (jika ada)
686
  selected_tahun = []
687
  if available_cols['has_tahun']:
688
- # Konversi tanggal ke tahun jika perlu
689
  if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
690
  df_clean['tahun'] = pd.to_datetime(
691
  df_clean['tanggal'], errors='coerce').dt.year
@@ -696,7 +618,7 @@ if st.session_state.df_predicted is not None:
696
  selected_tahun = st.sidebar.multiselect(
697
  "Tahun", tahun_options, default=tahun_options)
698
 
699
- # Filter Semester (jika ada)
700
  selected_semester = []
701
  if available_cols['has_semester']:
702
  semester_options = sorted(
@@ -705,7 +627,7 @@ if st.session_state.df_predicted is not None:
705
  selected_semester = st.sidebar.multiselect(
706
  "Semester", semester_options, default=semester_options)
707
 
708
- # Apply semua filter yang dipilih
709
  df_filtered = df_clean.copy()
710
 
711
  if selected_matkul and available_cols['has_matkul']:
@@ -714,130 +636,123 @@ if st.session_state.df_predicted is not None:
714
 
715
  if selected_prodi and available_cols['has_prodi']:
716
  df_filtered = df_filtered[df_filtered["nama_prodi"].isin(
717
- selected_prodi
718
- if selected_prodi and available_cols['has_prodi']:
719
- df_filtered=df_filtered[df_filtered["nama_prodi"].isin(
720
- selected_prodi)]
721
-
722
- if selected_tahun and available_cols['has_tahun']:
723
- df_filtered=df_filtered[df_filtered["tahun"].isin(selected_tahun)]
724
-
725
- if selected_semester and available_cols['has_semester']:
726
- df_filtered=df_filtered[df_filtered["semester"].isin(
727
- selected_semester)]
728
-
729
- # ============ TAMPILAN TABEL HASIL ============
730
- st.markdown("### Tabel Data Hasil Prediksi")
731
- st.dataframe(df_filtered, width='stretch')
732
-
733
- # ============ TOMBOL DOWNLOAD ============
734
- col_dl1, col_dl2=st.columns(2)
735
- with col_dl1:
736
- # Download data terfilter
737
- st.download_button(
738
- label="Unduh Data Terfilter",
739
- data=convert_df_to_excel(df_filtered),
740
- file_name="hasil_prediksi_absa_filtered.xlsx",
741
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
742
- use_container_width=True
743
- )
744
-
745
- with col_dl2:
746
- # Download semua data tanpa filter
747
- st.download_button(
748
- label="Unduh Semua Data",
749
- data=convert_df_to_excel(df_predicted),
750
- file_name="hasil_prediksi_absa_all.xlsx",
751
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
752
- use_container_width=True
753
- )
754
-
755
- st.info(
756
- f"Menampilkan {len(df_filtered):,} dari {len(df_predicted):,} data ulasan setelah difilter."
757
- )
758
-
759
- # ============ RINGKASAN CEPAT ============
760
- st.markdown("")
761
- st.markdown("### Ringkasan Cepat")
762
- st.markdown("")
763
-
764
- # Hitung total sentimen dari semua aspek
765
- total_pos=(df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
766
- total_net=(df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
767
- total_neg=(df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
768
-
769
- # Tentukan kolom mana yang tersedia untuk ditampilkan
770
- summary_cols=[]
771
-
772
- # Kolom dasar (selalu ada)
773
- summary_cols.extend(['ulasan', 'aspek'])
774
-
775
- # Kolom opsional berdasarkan data yang tersedia
776
- if available_cols['has_matkul']:
777
- summary_cols.append('matkul')
778
- if available_cols['has_prodi']:
779
- summary_cols.append('prodi')
780
- if available_cols['has_semester']:
781
- summary_cols.append('semester')
782
 
783
- # Buat kolom dinamis berdasarkan jumlah metrik
784
- num_cols=len(summary_cols)
785
- cols=st.columns(num_cols)
 
786
 
787
- col_idx=0
 
 
788
 
789
- # Metrik: Ulasan & Aspek (selalu ada)
790
- cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
791
- col_idx += 1
792
- cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
793
- col_idx += 1
794
 
795
- # Metrik: Mata Kuliah (jika ada)
796
- if available_cols['has_matkul']:
797
- matkul_count=df_filtered['nama_matakuliah'].nunique()
798
- cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
799
- col_idx += 1
 
 
 
 
 
800
 
801
- # Metrik: Prodi (jika ada)
802
- if available_cols['has_prodi']:
803
- prodi_count=df_filtered['nama_prodi'].nunique()
804
- cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
805
- col_idx += 1
806
 
807
- # Metrik: Semester (jika ada)
808
- if available_cols['has_semester']:
809
- semester_count=df_filtered['semester'].nunique()
810
- cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
811
- col_idx += 1
812
 
813
- st.markdown("")
 
 
 
 
814
 
815
- # Baris kedua: Sentimen + info tambahan
816
- summary_cols2=['positif', 'netral', 'negatif']
 
 
 
817
 
818
- if available_cols['has_tahun']:
819
- summary_cols2.append('tahun')
820
- if 'kritik_saran' in df_filtered.columns:
821
- summary_cols2.append('kata')
822
-
823
- cols2=st.columns(len(summary_cols2))
824
-
825
- col_idx2=0
826
- # Metrik: Sentimen Positif, Netral, Negatif
827
- cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
828
- col_idx2 += 1
829
- cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
830
- col_idx2 += 1
831
- cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
832
- col_idx2 += 1
833
-
834
- # Metrik: Rentang tahun (jika ada)
835
- if available_cols['has_tahun']:
836
- if 'tahun' in df_filtered.columns:
837
- tahun_valid=df_filtered['tahun'].dropna()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838
  if len(tahun_valid) > 0:
839
- tahun_min=int(tahun_valid.min())
840
- tahun_max=int(tahun_valid.max())
841
  if tahun_min == tahun_max:
842
  cols2[col_idx2].metric("Tahun", f"{tahun_min}")
843
  else:
@@ -845,95 +760,89 @@ if st.session_state.df_predicted is not None:
845
  "Rentang Tahun", f"{tahun_min} - {tahun_max}")
846
  else:
847
  cols2[col_idx2].metric("Rentang Tahun", "N/A")
848
- else:
849
  cols2[col_idx2].metric("Rentang Tahun", "N/A")
850
- col_idx2 += 1
851
 
852
- # Metrik: Rata-rata panjang kata (jika kolom kritik_saran ada)
853
- if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
854
- try:
855
- word_counts=df_filtered['kritik_saran'].astype(
856
  str).str.split().str.len()
857
- avg_word_count=round(word_counts.mean(), 1)
858
  cols2[col_idx2].metric(
859
  "Rata-rata Panjang Kata", f"{avg_word_count} kata")
860
- except Exception:
861
  cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
862
 
863
- # ============ VISUALISASI DATA ============
864
- st.markdown("---")
865
- st.markdown("### Visualisasi Data")
866
 
867
- # Visualisasi Sentimen Dasar (selalu ditampilkan)
868
- col1, col2=st.columns(2)
869
- with col1:
870
- show_sentiment_bar_chart(df_filtered, ASPEK_COLUMNS)
871
- with col2:
872
- show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
873
 
874
- # Visualisasi berdasarkan kolom yang tersedia
875
- viz_shown=False
876
 
877
- # Visualisasi: Distribusi Tahun & Semester
878
- if available_cols['has_tahun'] or available_cols['has_semester']:
879
- col1, col2=st.columns(2)
880
- with col1:
881
  if available_cols['has_tahun']:
882
- result=show_year_distribution(df_filtered)
883
  if result:
884
- viz_shown=True
885
- with col2:
886
  if available_cols['has_semester']:
887
- result=show_semester_distribution(df_filtered)
888
  if result:
889
- viz_shown=True
890
-
891
- # Visualisasi: Distribusi Prodi
892
- if available_cols['has_prodi']:
893
- st.markdown("---")
894
- result=show_prodi_distribution(df_filtered)
895
- if result:
896
- viz_shown=True
897
-
898
- # Visualisasi: Distribusi Top 10 Mata Kuliah
899
- if available_cols['has_matkul']:
900
- st.markdown("---")
901
- result=show_top10_matkul_distribution(df_filtered)
902
- if result:
903
- viz_shown=True
904
-
905
- # Visualisasi: Sentimen per Tahun/Semester
906
- if available_cols['has_tahun'] or available_cols['has_semester']:
907
- st.markdown("---")
908
- col1, col2=st.columns(2)
909
- with col1:
910
  if available_cols['has_tahun']:
911
- result=show_sentiment_by_year(df_filtered, ASPEK_COLUMNS)
912
  if result:
913
- viz_shown=True
914
- with col2:
915
  if available_cols['has_semester']:
916
- result=show_sentiment_by_semester(df_filtered, ASPEK_COLUMNS)
917
  if result:
918
- viz_shown=True
919
-
920
- # Visualisasi: Sentimen per Prodi
921
- if available_cols['has_prodi']:
922
- st.markdown("---")
923
- result=show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
924
- if result:
925
- viz_shown=True
926
-
927
- # Visualisasi: Sentimen per Top 10 Mata Kuliah
928
- if available_cols['has_matkul']:
929
- st.markdown("---")
930
- result=show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
931
- if result:
932
- viz_shown=True
933
-
934
- # ============ FOOTER ============
935
- st.caption("""
936
  <div class='footer'>
937
- © 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
938
  </div>
939
  """, unsafe_allow_html=True)
 
6
  UPDATED: Dengan Batch + Chunked Processing + Session-based Cache untuk multi-user
7
  UPDATED: Visualisasi dinamis yang menyesuaikan dengan kolom yang tersedia
8
  """
 
 
9
  import os
10
  import time
11
  import gc
 
36
  from preprocessing import text_preprocessing_pipeline
37
 
38
  # Konfigurasi untuk chunked processing
39
+ CHUNK_SIZE = 2500
40
+ ENABLE_CHUNKED = True
41
+ CACHE_EXPIRY_HOURS = 24
42
 
 
43
  os.makedirs("chache_file", exist_ok=True)
44
  os.makedirs("chache_file/sessions", exist_ok=True)
45
 
46
+ # Konfigurasi halaman
47
  st.set_page_config(
48
  page_title="ABSA IndoBERT",
49
  layout="wide",
50
  page_icon="💬"
51
  )
52
 
53
+ # Load custom CSS
54
  with open(os.path.join("assets", "style.css"), encoding="utf-8") as f:
55
  st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
56
  st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/font/bootstrap-icons.css" rel="stylesheet">', unsafe_allow_html=True)
57
 
58
 
59
  def get_session_id():
60
+ """Generate atau retrieve session ID untuk user - PERSISTENT across refresh"""
 
 
 
61
  query_params = st.query_params
62
 
 
63
  if "sid" in query_params:
64
  sid = query_params["sid"]
65
  st.session_state.session_id = sid
66
  return sid
67
 
 
68
  if "session_id" not in st.session_state:
69
  new_session_id = str(uuid.uuid4())
70
  st.session_state.session_id = new_session_id
71
  st.query_params["sid"] = new_session_id
72
  return new_session_id
73
 
 
74
  existing_id = st.session_state.session_id
75
  st.query_params["sid"] = existing_id
76
  return existing_id
 
92
 
93
 
94
  def cleanup_old_sessions():
95
+ """Hapus session cache yang sudah expired (> 24 jam)"""
 
 
 
96
  sessions_dir = Path("chache_file/sessions")
97
  if not sessions_dir.exists():
98
  return
 
103
  mod_time = session_dir.stat().st_mtime
104
  age_hours = (current_time - mod_time) / 3600
105
 
 
106
  if age_hours > CACHE_EXPIRY_HOURS:
107
  try:
108
  shutil.rmtree(session_dir)
 
111
  print(f"Error deleting session {session_dir.name}: {e}")
112
 
113
 
 
114
  cleanup_old_sessions()
115
 
116
 
117
  @st.cache_resource(show_spinner=False)
118
  def get_model_resources():
119
+ """Memuat model dan tokenizer IndoBERT."""
 
 
 
120
  return load_model_and_tokenizer()
121
 
122
 
 
123
  with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
124
  model, tokenizer, le, device = get_model_resources()
125
 
 
126
  success_placeholder = st.empty()
127
  success_placeholder.success("Model dan tokenizer berhasil dimuat!")
128
  time.sleep(1)
 
130
 
131
 
132
  def convert_df_to_excel(df):
133
+ """Mengubah DataFrame menjadi file Excel dalam bentuk byte stream."""
134
  output = BytesIO()
135
  with pd.ExcelWriter(output, engine="openpyxl") as writer:
136
  df.to_excel(writer, index=False)
 
138
 
139
 
140
  def clear_memory():
141
+ """Clear memory cache"""
142
  gc.collect()
143
  if torch.cuda.is_available():
144
  torch.cuda.empty_cache()
 
146
 
147
  def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
148
  """
149
+ Memproses satu chunk data dengan batch processing.
150
+ Progress bar: Preprocessing 0-100%, lalu Predicting 0-100%
 
 
 
 
 
 
 
 
 
 
 
 
151
  """
152
  # STEP 1: Preprocessing (0-100%)
153
  cleaned_text_list = []
 
157
  clean_text = text_preprocessing_pipeline(str(raw_text))
158
  cleaned_text_list.append(clean_text)
159
 
 
160
  if idx % 50 == 0 or idx == total_rows - 1:
161
  progress = (idx + 1) / total_rows
162
  progress_bar.progress(progress)
 
174
  num_sents = len(cleaned_text_list)
175
  num_asps = len(ASPEK_COLUMNS)
176
 
 
177
  ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
178
  tokenizer, CONFIG["max_len"])
179
  dl = DataLoader(
 
183
  num_workers=0
184
  )
185
 
 
186
  predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
187
 
188
  batch_counter = 0
189
  total_batch_count = len(dl)
190
 
 
191
  model.eval()
192
  with torch.no_grad():
193
  for batch_data in dl:
 
196
  sent_idxs = batch_data['sent_idx'].numpy()
197
  asp_idxs = batch_data['aspect_idx'].numpy()
198
 
 
199
  model_outputs = model(inp_ids, attn_mask)
200
  probabilities = F.softmax(model_outputs, dim=1)
201
  predicted_indices = torch.argmax(
202
  probabilities, dim=1).cpu().numpy()
203
  pred_labels = le.inverse_transform(predicted_indices)
204
 
 
205
  for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
206
  predictions_matrix[s_idx][a_idx] = lbl
207
 
 
208
  batch_counter += 1
209
  progress = batch_counter / total_batch_count
210
  progress_bar.progress(progress)
 
216
  for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
217
  row_dict = data_row.to_dict()
218
  row_dict["kritik_saran"] = cleaned_text_list[idx]
 
219
  for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
220
  row_dict[asp_name] = predictions_matrix[idx][asp_idx]
221
  result_list.append(row_dict)
222
 
223
  result_dataframe = pd.DataFrame(result_list)
224
 
 
225
  chunks_directory = get_session_chunks_dir()
226
  chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
227
  result_dataframe.to_csv(chunk_filepath, index=False)
 
230
  progress_bar.progress(1.0)
231
  status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
232
 
 
233
  clear_memory()
234
 
235
  return result_dataframe
236
 
237
 
238
  def get_available_columns(df):
239
+ """Deteksi kolom-kolom yang tersedia dalam dataframe"""
 
 
 
240
  available = {
241
  'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
242
  'has_semester': 'semester' in df.columns,
 
246
  return available
247
 
248
 
 
 
249
  # Judul aplikasi
250
  st.markdown("""
251
  <h1 class='title-center'>ABSA IndoBERT</h1>
 
257
  st.markdown(" ")
258
  st.markdown(" ")
259
 
260
+ # Panduan penggunaan
261
  steps = [
262
  {"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
263
  "description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
 
269
  "description": "Unduh hasil analisis lengkap Anda dalam format file Excel untuk laporan lebih lanjut."}
270
  ]
271
 
 
272
  cols = st.columns(len(steps))
273
 
274
  for i, step in enumerate(steps):
 
284
  st.markdown("")
285
  st.markdown("")
286
 
287
+ # Upload file
288
  uploaded_file = st.file_uploader(
289
  " Upload Data Kritik & Saran",
290
  type=["xlsx"],
291
  help="File maksimal 200MB dengan format .xlsx"
292
  )
293
 
294
+ # Clear cache buttons - SESSION SPECIFIC
295
  session_cache_dir = get_session_cache_dir()
296
  session_result_file = session_cache_dir / "temp_predicted.csv"
297
  session_chunks_dir = get_session_chunks_dir()
298
 
 
299
  if session_result_file.exists():
300
  if st.button("Hapus Cache Data"):
301
  session_result_file.unlink()
 
303
  time.sleep(1)
304
  st.rerun()
305
 
 
306
  if session_chunks_dir.exists():
307
  chunk_files = list(session_chunks_dir.glob("*.csv"))
308
  if chunk_files:
 
314
  time.sleep(1)
315
  st.rerun()
316
 
 
317
  if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
318
  if not uploaded_file:
319
  metadata_file = session_cache_dir / "metadata.txt"
 
334
  else:
335
  st.caption(" ")
336
 
337
+
338
  if "df_predicted" not in st.session_state:
339
  st.session_state.df_predicted = None
340
 
 
341
  if st.session_state.df_predicted is None and session_result_file.exists():
342
  try:
343
  df_cached = pd.read_csv(session_result_file)
 
344
  if "tahun" in df_cached.columns:
345
  df_cached["tahun"] = pd.to_numeric(
346
  df_cached["tahun"], errors='coerce').astype('Int64')
 
350
  st.warning(f"Gagal memuat cache: {e}")
351
 
352
 
 
353
  if uploaded_file:
354
  file_bytes = uploaded_file.getvalue()
 
 
355
  if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
356
  st.session_state.last_uploaded_file = file_bytes
357
  st.session_state.uploaded_filename = uploaded_file.name
 
358
  try:
 
359
  df_uploaded = pd.read_excel(BytesIO(file_bytes))
360
 
 
361
  if "tahun" in df_uploaded.columns:
362
  df_uploaded["tahun"] = pd.to_numeric(
363
  df_uploaded["tahun"], errors='coerce').astype('Int64')
 
365
  except ValueError as err:
366
  st.error(f"Gagal membaca file: {err}")
367
  else:
 
368
  if "kritik_saran" not in df_uploaded.columns:
369
  st.error("Kolom 'kritik_saran' tidak ditemukan.")
370
  else:
 
371
  df_uploaded = df_uploaded.drop_duplicates(
372
  subset=["kritik_saran"])
 
 
373
  for aspect_col in ASPEK_COLUMNS:
374
  if aspect_col not in df_uploaded.columns:
375
  df_uploaded[aspect_col] = None
 
379
  total_rows = len(df_uploaded)
380
  use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
381
 
 
382
  if use_chunked:
383
  num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
384
 
 
385
  info_col1, info_col2, info_col3 = st.columns(3)
386
  with info_col1:
387
  st.info(f"**Total data:** {total_rows:,} rows")
 
398
  chunk_status_text = st.empty()
399
  overall_status = st.empty()
400
 
 
401
  for start_idx in range(0, total_rows, CHUNK_SIZE):
402
  current_chunk_number = (start_idx // CHUNK_SIZE) + 1
403
  current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
 
406
  current_chunk_file = session_chunks_dir / \
407
  f"chunk_{current_chunk_number}.csv"
408
 
 
409
  if current_chunk_file.exists():
410
  chunk_result = pd.read_csv(current_chunk_file)
411
  all_chunk_results.append(chunk_result)
 
424
  time.sleep(0.3)
425
  continue
426
 
 
427
  chunk_progress_bar.progress(0)
428
 
429
  chunk_result = process_chunk_batch(
 
432
  )
433
  all_chunk_results.append(chunk_result)
434
 
 
435
  processed = min(start_idx + CHUNK_SIZE, total_rows)
436
  progress_pct = (processed / total_rows) * 100
437
  elapsed = time.time() - start_time
 
446
 
447
  time.sleep(0.3)
448
 
 
449
  chunk_status_text.empty()
450
  overall_status.info("🔄 Menggabungkan semua chunks...")
451
  df_session = pd.concat(
 
455
  end_time = time.time()
456
  duration = end_time - start_time
457
 
 
458
  else:
459
  st.info(
460
  f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
 
464
  progress_bar = st.progress(0)
465
  status_text = st.empty()
466
 
 
467
  cleaned_text_list = []
468
  total_preprocessing = len(df_uploaded)
469
 
 
481
  status_text.text("Memulai prediksi...")
482
  time.sleep(0.3)
483
 
 
484
  batch_sz = CONFIG.get("batch_size", 32)
485
  num_sents = len(cleaned_text_list)
486
  num_asps = len(ASPEK_COLUMNS)
 
520
  status_text.text(
521
  f"Predicting: {batch_counter}/{total_batch_count} batches")
522
 
 
523
  result_list = []
524
  for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
525
  row_dict = data_row.to_dict()
 
539
  end_time = time.time()
540
  duration = end_time - start_time
541
 
 
542
  st.session_state.df_predicted = df_session
543
  df_session.to_csv(session_result_file, index=False)
544
 
 
545
  metadata_file = session_cache_dir / "metadata.txt"
546
  with open(metadata_file, "w", encoding="utf-8") as f:
547
  f.write(uploaded_file.name)
548
 
 
549
  total_items = total_rows * len(ASPEK_COLUMNS)
550
  items_per_second = total_items / duration if duration > 0 else 0
551
 
 
552
  if use_chunked:
553
  st.success(
554
  f"✅ **Chunked + Batch Processing selesai!**\n\n"
 
568
  f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
569
  )
570
 
571
+ # Setelah prediksi selesai
572
  if st.session_state.df_predicted is not None:
573
  df_predicted = st.session_state.df_predicted
574
 
575
+ # Deteksi kolom yang tersedia
576
  available_cols = get_available_columns(df_predicted)
577
 
578
+ # Sidebar filter dengan pengecekan kolom dinamis
579
  st.sidebar.header("Filter Data")
580
 
581
  df_clean = df_predicted.copy()
 
587
  st.sidebar.info(
588
  "Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
589
 
590
+ # Filter Mata Kuliah
591
  selected_matkul = []
592
  if available_cols['has_matkul']:
593
  matkul_options = sorted(
 
596
  selected_matkul = st.sidebar.multiselect(
597
  "Nama Mata Kuliah", matkul_options, default=matkul_options)
598
 
599
+ # Filter Program Studi
600
  selected_prodi = []
601
  if available_cols['has_prodi']:
602
  prodi_options = sorted(
 
605
  selected_prodi = st.sidebar.multiselect(
606
  "Program Studi", prodi_options, default=prodi_options)
607
 
608
+ # Filter Tahun
609
  selected_tahun = []
610
  if available_cols['has_tahun']:
 
611
  if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
612
  df_clean['tahun'] = pd.to_datetime(
613
  df_clean['tanggal'], errors='coerce').dt.year
 
618
  selected_tahun = st.sidebar.multiselect(
619
  "Tahun", tahun_options, default=tahun_options)
620
 
621
+ # Filter Semester
622
  selected_semester = []
623
  if available_cols['has_semester']:
624
  semester_options = sorted(
 
627
  selected_semester = st.sidebar.multiselect(
628
  "Semester", semester_options, default=semester_options)
629
 
630
+ # Apply filters
631
  df_filtered = df_clean.copy()
632
 
633
  if selected_matkul and available_cols['has_matkul']:
 
636
 
637
  if selected_prodi and available_cols['has_prodi']:
638
  df_filtered = df_filtered[df_filtered["nama_prodi"].isin(
639
+ selected_prodi)]
640
+
641
+ if selected_tahun and available_cols['has_tahun']:
642
+ df_filtered = df_filtered[df_filtered["tahun"].isin(selected_tahun)]
643
+
644
+ if selected_semester and available_cols['has_semester']:
645
+ df_filtered = df_filtered[df_filtered["semester"].isin(
646
+ selected_semester)]
647
+
648
+ # Tampilkan tabel hasil prediksi
649
+ st.markdown("### Tabel Data Hasil Prediksi")
650
+ st.dataframe(df_filtered, width='stretch')
651
+
652
+ # Download buttons
653
+ col_dl1, col_dl2 = st.columns(2)
654
+ with col_dl1:
655
+ st.download_button(
656
+ label="Unduh Data Terfilter",
657
+ data=convert_df_to_excel(df_filtered),
658
+ file_name="hasil_prediksi_absa_filtered.xlsx",
659
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
660
+ use_container_width=True
661
+ )
662
+
663
+ with col_dl2:
664
+ st.download_button(
665
+ label="Unduh Semua Data",
666
+ data=convert_df_to_excel(df_predicted),
667
+ file_name="hasil_prediksi_absa_all.xlsx",
668
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
669
+ use_container_width=True
670
+ )
671
+
672
+ st.info(
673
+ f"Menampilkan {len(df_filtered):,} dari {len(df_predicted):,} data ulasan setelah difilter."
674
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
675
 
676
+ # Ringkasan Cepat
677
+ st.markdown("")
678
+ st.markdown("### Ringkasan Cepat")
679
+ st.markdown("")
680
 
681
+ total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
682
+ total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
683
+ total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
684
 
685
+ # Hitung jumlah kolom yang tersedia untuk ringkasan
686
+ summary_cols = []
 
 
 
687
 
688
+ # Kolom dasar (selalu ada)
689
+ summary_cols.extend(['ulasan', 'aspek'])
690
+
691
+ # Kolom opsional
692
+ if available_cols['has_matkul']:
693
+ summary_cols.append('matkul')
694
+ if available_cols['has_prodi']:
695
+ summary_cols.append('prodi')
696
+ if available_cols['has_semester']:
697
+ summary_cols.append('semester')
698
 
699
+ # Buat kolom dinamis berdasarkan data yang tersedia
700
+ num_cols = len(summary_cols)
701
+ cols = st.columns(num_cols)
 
 
702
 
703
+ col_idx = 0
 
 
 
 
704
 
705
+ # Ulasan & Aspek (selalu ada)
706
+ cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
707
+ col_idx += 1
708
+ cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
709
+ col_idx += 1
710
 
711
+ # Mata Kuliah (jika ada)
712
+ if available_cols['has_matkul']:
713
+ matkul_count = df_filtered['nama_matakuliah'].nunique()
714
+ cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
715
+ col_idx += 1
716
 
717
+ # Prodi (jika ada)
718
+ if available_cols['has_prodi']:
719
+ prodi_count = df_filtered['nama_prodi'].nunique()
720
+ cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
721
+ col_idx += 1
722
+
723
+ # Semester (jika ada)
724
+ if available_cols['has_semester']:
725
+ semester_count = df_filtered['semester'].nunique()
726
+ cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
727
+ col_idx += 1
728
+
729
+ st.markdown("")
730
+
731
+ # Baris kedua: Sentimen + info tambahan
732
+ summary_cols2 = ['positif', 'netral', 'negatif']
733
+
734
+ if available_cols['has_tahun']:
735
+ summary_cols2.append('tahun')
736
+ if 'kritik_saran' in df_filtered.columns:
737
+ summary_cols2.append('kata')
738
+
739
+ cols2 = st.columns(len(summary_cols2))
740
+
741
+ col_idx2 = 0
742
+ cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
743
+ col_idx2 += 1
744
+ cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
745
+ col_idx2 += 1
746
+ cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
747
+ col_idx2 += 1
748
+
749
+ # Rentang tahun (jika ada)
750
+ if available_cols['has_tahun']:
751
+ if 'tahun' in df_filtered.columns:
752
+ tahun_valid = df_filtered['tahun'].dropna()
753
  if len(tahun_valid) > 0:
754
+ tahun_min = int(tahun_valid.min())
755
+ tahun_max = int(tahun_valid.max())
756
  if tahun_min == tahun_max:
757
  cols2[col_idx2].metric("Tahun", f"{tahun_min}")
758
  else:
 
760
  "Rentang Tahun", f"{tahun_min} - {tahun_max}")
761
  else:
762
  cols2[col_idx2].metric("Rentang Tahun", "N/A")
763
+ else:
764
  cols2[col_idx2].metric("Rentang Tahun", "N/A")
765
+ col_idx2 += 1
766
 
767
+ # Rata-rata panjang kata (jika ada)
768
+ if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
769
+ try:
770
+ word_counts = df_filtered['kritik_saran'].astype(
771
  str).str.split().str.len()
772
+ avg_word_count = round(word_counts.mean(), 1)
773
  cols2[col_idx2].metric(
774
  "Rata-rata Panjang Kata", f"{avg_word_count} kata")
775
+ except Exception:
776
  cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
777
 
778
+ st.markdown("---")
779
+ st.markdown("### Visualisasi Data")
 
780
 
781
+ # Visualisasi Sentimen (selalu ditampilkan)
782
+ col1, col2 = st.columns(2)
783
+ with col1:
784
+ show_sentiment_bar_chart(df_filtered, ASPEK_COLUMNS)
785
+ with col2:
786
+ show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
787
 
788
+ # Visualisasi berdasarkan kolom yang tersedia
789
+ viz_shown = False
790
 
791
+ if available_cols['has_tahun'] or available_cols['has_semester']:
792
+ col1, col2 = st.columns(2)
793
+ with col1:
 
794
  if available_cols['has_tahun']:
795
+ result = show_year_distribution(df_filtered)
796
  if result:
797
+ viz_shown = True
798
+ with col2:
799
  if available_cols['has_semester']:
800
+ result = show_semester_distribution(df_filtered)
801
  if result:
802
+ viz_shown = True
803
+
804
+ if available_cols['has_prodi']:
805
+ st.markdown("---")
806
+ result = show_prodi_distribution(df_filtered)
807
+ if result:
808
+ viz_shown = True
809
+
810
+ if available_cols['has_matkul']:
811
+ st.markdown("---")
812
+ result = show_top10_matkul_distribution(df_filtered)
813
+ if result:
814
+ viz_shown = True
815
+
816
+ # Sentimen per tahun/semester
817
+ if available_cols['has_tahun'] or available_cols['has_semester']:
818
+ st.markdown("---")
819
+ col1, col2 = st.columns(2)
820
+ with col1:
 
 
821
  if available_cols['has_tahun']:
822
+ result = show_sentiment_by_year(df_filtered, ASPEK_COLUMNS)
823
  if result:
824
+ viz_shown = True
825
+ with col2:
826
  if available_cols['has_semester']:
827
+ result = show_sentiment_by_semester(df_filtered, ASPEK_COLUMNS)
828
  if result:
829
+ viz_shown = True
830
+
831
+ if available_cols['has_prodi']:
832
+ st.markdown("---")
833
+ result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
834
+ if result:
835
+ viz_shown = True
836
+
837
+ if available_cols['has_matkul']:
838
+ st.markdown("---")
839
+ result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
840
+ if result:
841
+ viz_shown = True
842
+
843
+ # Footer
844
+ st.caption("""
 
 
845
  <div class='footer'>
846
+ © 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
847
  </div>
848
  """, unsafe_allow_html=True)
model_utils.py CHANGED
@@ -17,7 +17,7 @@ except ImportError:
17
  subprocess.check_call(['pip', 'install', 'scikit-learn'])
18
  from sklearn.preprocessing import LabelEncoder
19
 
20
- # Custom Dataset untuk batch processing
21
  class ABSADataset(Dataset):
22
  """
23
  Custom Dataset untuk ABSA batch processing.
@@ -294,4 +294,87 @@ def predict_multi_aspect(model, tokenizer, sentence, aspek_list, label_encoder,
294
  # Prediksi tanpa menghitung gradient (inference mode)
295
  with torch.no_grad():
296
  # Forward pass
297
- outputs = model( in
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  subprocess.check_call(['pip', 'install', 'scikit-learn'])
18
  from sklearn.preprocessing import LabelEncoder
19
 
20
+
21
  class ABSADataset(Dataset):
22
  """
23
  Custom Dataset untuk ABSA batch processing.
 
294
  # Prediksi tanpa menghitung gradient (inference mode)
295
  with torch.no_grad():
296
  # Forward pass
297
+ outputs = model(input_ids, attention_mask)
298
+ # Konversi logits ke probabilitas dengan softmax
299
+ probs = F.softmax(outputs, dim=1).squeeze()
300
+ # Ambil indeks dengan probabilitas tertinggi
301
+ idx = torch.argmax(probs).item()
302
+ # Konversi indeks ke label sentimen
303
+ label = label_encoder.inverse_transform([idx])[0]
304
+ # Simpan hasil
305
+ results[aspek] = label
306
+
307
+ return results
308
+
309
+
310
def predict_multi_aspect_batch(model, tokenizer, sentences, aspek_list, label_encoder, device, max_len, batch_size=None):
    """
    Predict the sentiment of every aspect for multiple sentences at once.

    All (sentence, aspect) pairs are enumerated by ABSADataset and scored
    in batches on the target device, which is far more efficient than
    calling the single-sentence predictor in a loop.

    Args:
        model (nn.Module): Loaded ABSA model.
        tokenizer (AutoTokenizer): IndoBERT tokenizer.
        sentences (list): Input sentences.
        aspek_list (list): Aspects to predict for each sentence.
        label_encoder (LabelEncoder): Fitted label encoder.
        device (torch.device): Device to run inference on (cuda/cpu).
        max_len (int): Maximum token length for the tokenizer.
        batch_size (int, optional): Batch size. Defaults to CONFIG["batch_size"].

    Returns:
        list: One dict per sentence mapping {aspect: predicted_label}.
    """
    # Fall back to the project-wide batch size when none is given
    if batch_size is None:
        batch_size = CONFIG.get("batch_size", 32)

    # === DATASET AND DATALOADER ===
    # The dataset yields every sentence x aspect combination with its indices
    dataset = ABSADataset(sentences, aspek_list, tokenizer, max_len)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # keep deterministic order so indices stay valid
        num_workers=CONFIG.get("num_workers", 0)
    )

    # === RESULT CONTAINER ===
    # Matrix [num_sentences x num_aspects] filled with predicted labels
    all_predictions = [[None] * len(aspek_list) for _ in range(len(sentences))]

    # === BATCH PREDICTION ===
    model.eval()  # evaluation mode (disables dropout etc.)
    with torch.no_grad():  # no gradients needed for inference
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sent_indices = batch['sent_idx'].numpy()
            aspect_indices = batch['aspect_idx'].numpy()

            # Forward pass for the whole batch, then pick the top class
            outputs = model(input_ids, attention_mask)
            probs = F.softmax(outputs, dim=1)
            pred_indices = torch.argmax(probs, dim=1).cpu().numpy()
            labels = label_encoder.inverse_transform(pred_indices)

            # Scatter the batch results back to their original positions
            for sent_idx, aspect_idx, label in zip(sent_indices, aspect_indices, labels):
                all_predictions[sent_idx][aspect_idx] = label

    # === CONVERT TO DICTIONARY FORMAT ===
    # One {aspect: label} dict per input sentence, preserving order
    return [dict(zip(aspek_list, row)) for row in all_predictions]
visualization.py CHANGED
@@ -13,241 +13,135 @@ import plotly.express as px
13
  from config import ASPEK_COLUMNS
14
 
15
 
16
- # Palet warna kustom untuk setiap kategori sentimen
17
  sentimen_palette = {
18
- "netral": "#FFE24C", # Kuning untuk netral
19
- "positif": "#4CFF72", # Hijau untuk positif
20
- "negatif": "#FF4C4C" # Merah untuk negatif
21
  }
22
-
23
- # Urutan kategori sentimen untuk konsistensi visualisasi
24
  category_order = ["netral", "positif", "negatif"]
25
 
26
- # Konfigurasi Plotly untuk interaktivitas chart
27
  config_options = {
28
- "scrollZoom": False, # Nonaktifkan zoom dengan scroll
29
- "displayModeBar": False # Sembunyikan toolbar Plotly
30
  }
31
 
32
 
33
  def show_sentiment_bar_chart(df_predicted, aspek_columns):
34
- """
35
- Menampilkan bar chart distribusi sentimen per aspek.
36
- Chart menampilkan jumlah setiap sentimen (positif/netral/negatif) untuk setiap aspek.
37
-
38
- Args:
39
- df_predicted (pd.DataFrame): DataFrame dengan hasil prediksi sentimen
40
- aspek_columns (list): List nama kolom aspek yang akan divisualisasikan
41
- """
42
- # Validasi: cek apakah data dan kolom aspek tersedia
43
  if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
44
  st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
45
  return
46
 
47
- # Transform data dari wide format ke long format untuk visualisasi
48
  df_long = df_predicted.melt(
49
  value_vars=aspek_columns,
50
  var_name="aspek",
51
  value_name="sentimen"
52
  )
53
-
54
- # Konversi sentimen ke categorical untuk sorting yang konsisten
55
  df_long["sentimen"] = pd.Categorical(
56
  df_long["sentimen"],
57
  categories=category_order,
58
  ordered=True
59
  )
60
-
61
- # Hitung jumlah setiap kombinasi aspek-sentimen
62
  count_data = df_long.groupby(
63
  ["aspek", "sentimen"], observed=False
64
  ).size().reset_index(name="jumlah")
65
-
66
- # Buat bar chart dengan Plotly
67
  fig = px.bar(
68
  count_data,
69
  x="aspek",
70
  y="jumlah",
71
  color="sentimen",
72
- barmode="group", # Bar dikelompokkan berdampingan
73
  color_discrete_map=sentimen_palette,
74
  category_orders={"sentimen": category_order}
75
  )
76
  fig.update_layout(title="Distribusi Sentimen per Aspek")
77
-
78
- # Tampilkan chart di Streamlit
79
  st.plotly_chart(fig, use_container_width=True, config=config_options)
80
 
81
 
82
  def show_sentiment_pie_chart(df_predicted, aspek_columns):
83
- """
84
- Menampilkan pie chart distribusi total sentimen dari semua aspek.
85
- Chart menampilkan proporsi keseluruhan sentimen dalam bentuk donut chart.
86
-
87
- Args:
88
- df_predicted (pd.DataFrame): DataFrame dengan hasil prediksi sentimen
89
- aspek_columns (list): List nama kolom aspek
90
- """
91
- # Flatten semua nilai sentimen dari semua aspek menjadi satu array
92
  sentimen_total = df_predicted[aspek_columns].values.ravel()
93
-
94
- # Hitung frekuensi setiap sentimen
95
  sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
96
  sentimen_counts.columns = ["sentimen", "jumlah"]
97
  sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
98
-
99
- # Buat pie chart (donut chart dengan hole=0.3)
100
- fig = px.pie(
101
- sentimen_counts,
102
- names="sentimen",
103
- values="jumlah",
104
- color="sentimen",
105
- color_discrete_map=sentimen_palette,
106
- hole=0.3 # Buat donut chart
107
- )
108
  fig.update_layout(title="Total Komposisi Sentimen")
109
-
110
- # Tampilkan persentase dan label di dalam chart
111
  fig.update_traces(textposition='inside', textinfo='percent+label')
112
-
113
  st.plotly_chart(fig, use_container_width=True, config=config_options)
114
 
115
 
116
  def show_year_distribution(df):
117
- """
118
- Menampilkan distribusi jumlah kritik/saran per tahun.
119
- Jika kolom 'tahun' tidak ada, akan mencoba ekstrak dari kolom 'tanggal'.
120
-
121
- Args:
122
- df (pd.DataFrame): DataFrame input
123
-
124
- Returns:
125
- bool/None: True jika berhasil, None jika kolom tidak tersedia
126
- """
127
- # Coba ekstrak tahun dari kolom tanggal jika kolom tahun tidak ada
128
  if 'tanggal' in df.columns and 'tahun' not in df.columns:
129
  df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
130
 
131
- # Validasi: return None jika tidak ada kolom tahun
132
  if 'tahun' not in df.columns:
133
- return None
134
 
135
- # Filter data yang memiliki nilai tahun valid
136
  df_tahun = df.dropna(subset=['tahun']).copy()
137
  if df_tahun.empty:
138
  return None
139
 
140
- # Konversi tahun ke integer
141
  df_tahun['tahun'] = df_tahun['tahun'].astype(int)
142
-
143
- # Hitung frekuensi per tahun
144
  year_counts = df_tahun['tahun'].value_counts().reset_index()
145
  year_counts.columns = ['tahun', 'jumlah']
146
  year_counts = year_counts.sort_values('jumlah', ascending=False)
147
 
148
- # Buat bar chart
149
- fig = px.bar(
150
- year_counts,
151
- x='tahun',
152
- y='jumlah',
153
- color='tahun',
154
- title="Distribusi Kritik/Saran per Tahun"
155
- )
156
- # Treat tahun sebagai kategori untuk menghindari interpolasi
157
  fig.update_layout(xaxis=dict(type='category'))
158
-
159
  st.plotly_chart(fig, use_container_width=True, config=config_options)
160
  return True
161
 
162
 
163
  def show_semester_distribution(df):
164
- """
165
- Menampilkan distribusi jumlah kritik/saran per semester.
166
-
167
- Args:
168
- df (pd.DataFrame): DataFrame input
169
-
170
- Returns:
171
- bool/None: True jika berhasil, None jika kolom tidak tersedia
172
- """
173
- # Validasi: cek apakah kolom semester ada
174
  if 'semester' not in df.columns:
175
  return None
176
 
177
- # Hitung frekuensi per semester
178
  semester_counts = df['semester'].value_counts().reset_index()
179
  semester_counts.columns = ['semester', 'jumlah']
180
  semester_counts = semester_counts.sort_values('jumlah', ascending=False)
181
-
182
- # Buat bar chart
183
- fig = px.bar(
184
- semester_counts,
185
- x='semester',
186
- y='jumlah',
187
- color='semester',
188
- title="Distribusi Kritik/Saran per Semester"
189
- )
190
- # Sort berdasarkan total descending
191
  fig.update_layout(xaxis=dict(categoryorder='total descending'))
192
-
193
  st.plotly_chart(fig, use_container_width=True, config=config_options)
194
  return True
195
 
196
 
197
  def show_prodi_distribution(df):
198
- """
199
- Menampilkan jumlah kritik/saran per program studi dalam bentuk horizontal bar chart.
200
-
201
- Args:
202
- df (pd.DataFrame): DataFrame input
203
-
204
- Returns:
205
- bool/None: True jika berhasil, None jika kolom tidak tersedia
206
- """
207
- # Validasi: cek apakah kolom nama_prodi ada
208
  if 'nama_prodi' not in df.columns:
209
  return None
210
 
211
- # Hitung frekuensi per program studi
212
  prodi_counts = df['nama_prodi'].value_counts().reset_index()
213
  prodi_counts.columns = ['nama_prodi', 'jumlah']
214
-
215
- # Sort ascending untuk horizontal bar (terbanyak di atas)
216
  prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
217
-
218
- # Buat horizontal bar chart
219
  fig = px.bar(
220
  prodi_counts,
221
  x='jumlah',
222
  y='nama_prodi',
223
- orientation='h', # Horizontal orientation
224
  color='jumlah',
225
  title="Jumlah Kritik/Saran per Program Studi"
226
  )
227
-
228
  st.plotly_chart(fig, use_container_width=True, config=config_options)
229
  return True
230
 
231
 
232
  def show_top10_matkul_distribution(df):
233
- """
234
- Menampilkan 10 mata kuliah dengan jumlah kritik/saran terbanyak.
235
- Format: [kode_matakuliah] - [nama_matakuliah]
236
-
237
- Args:
238
- df (pd.DataFrame): DataFrame input
239
-
240
- Returns:
241
- bool/None: True jika berhasil, None jika kolom tidak tersedia
242
- """
243
- # Validasi: cek apakah kolom yang diperlukan ada
244
  required_cols = ['nama_matakuliah', 'kode_matakuliah']
245
  missing_cols = [col for col in required_cols if col not in df.columns]
246
 
247
  if missing_cols:
248
  return None
249
 
250
- # Group by kode dan nama mata kuliah, ambil 10 teratas
251
  matkul_counts = (
252
  df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
253
  .size()
@@ -255,17 +149,12 @@ def show_top10_matkul_distribution(df):
255
  .sort_values(by='jumlah', ascending=False)
256
  .head(10)
257
  )
258
-
259
- # Buat label gabungan: "kode - nama"
260
  matkul_counts['label'] = (
261
  matkul_counts['kode_matakuliah'] + " - " +
262
  matkul_counts['nama_matakuliah']
263
  )
264
-
265
- # Sort ascending untuk horizontal bar (terbanyak di atas)
266
  matkul_counts = matkul_counts.sort_values(by='jumlah', ascending=True)
267
 
268
- # Buat horizontal bar chart
269
  fig = px.bar(
270
  matkul_counts,
271
  x='jumlah',
@@ -274,124 +163,60 @@ def show_top10_matkul_distribution(df):
274
  title="Top 10 Mata Kuliah Berdasarkan Kritik/Saran",
275
  color='jumlah'
276
  )
277
-
278
  st.plotly_chart(fig, use_container_width=True, config=config_options)
279
  return True
280
 
281
 
282
  def show_sentiment_by_year(df, aspek_columns):
283
- """
284
- Menampilkan distribusi sentimen per tahun dalam bentuk grouped bar chart.
285
- Menunjukkan bagaimana sentimen berubah dari tahun ke tahun.
286
-
287
- Args:
288
- df (pd.DataFrame): DataFrame input
289
- aspek_columns (list): List nama kolom aspek
290
-
291
- Returns:
292
- bool/None: True jika berhasil, None jika kolom tidak tersedia
293
- """
294
- # Coba ekstrak tahun dari kolom tanggal jika kolom tahun tidak ada
295
  if 'tanggal' in df.columns and 'tahun' not in df.columns:
296
  df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
297
 
298
- # Validasi: return None jika tidak ada kolom tahun
299
  if 'tahun' not in df.columns:
300
  return None
301
 
302
- # Transform data dari wide ke long format, keep tahun sebagai ID variable
303
- df_long = df.melt(
304
- id_vars=['tahun'],
305
- value_vars=aspek_columns,
306
- var_name='aspek',
307
- value_name='sentimen'
308
- )
309
-
310
- # Group by tahun dan sentimen, hitung frekuensi
311
  year_sentiment = df_long.groupby(
312
  ['tahun', 'sentimen'], observed=False
313
  ).size().reset_index(name='jumlah')
314
-
315
  year_sentiment = year_sentiment.sort_values('jumlah', ascending=False)
316
-
317
- # Buat grouped bar chart
318
- fig = px.bar(
319
- year_sentiment,
320
- x='tahun',
321
- y='jumlah',
322
- color='sentimen',
323
- barmode='group', # Bars dikelompokkan per tahun
324
- color_discrete_map=sentimen_palette
325
- )
326
  fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Tahun")
327
-
328
  st.plotly_chart(fig, use_container_width=True, config=config_options)
329
  return True
330
 
331
 
332
  def show_sentiment_by_semester(df, aspek_columns):
333
- """
334
- Menampilkan distribusi sentimen per semester dalam bentuk grouped bar chart.
335
-
336
- Args:
337
- df (pd.DataFrame): DataFrame input
338
- aspek_columns (list): List nama kolom aspek
339
-
340
- Returns:
341
- bool/None: True jika berhasil, None jika kolom tidak tersedia
342
- """
343
- # Validasi: cek apakah kolom semester ada
344
  if 'semester' not in df.columns:
345
  return None
346
 
347
- # Transform data dari wide ke long format, keep semester sebagai ID variable
348
- df_long = df.melt(
349
- id_vars=['semester'],
350
- value_vars=aspek_columns,
351
- var_name='aspek',
352
- value_name='sentimen'
353
- )
354
-
355
- # Group by semester dan sentimen, hitung frekuensi
356
  semester_sentiment = df_long.groupby(
357
  ['semester', 'sentimen'], observed=False
358
  ).size().reset_index(name='jumlah')
359
-
360
  semester_sentiment = semester_sentiment.sort_values(
361
  'jumlah', ascending=False)
362
-
363
- # Buat grouped bar chart
364
- fig = px.bar(
365
- semester_sentiment,
366
- x='semester',
367
- y='jumlah',
368
- color='sentimen',
369
- barmode='group', # Bars dikelompokkan per semester
370
- color_discrete_map=sentimen_palette
371
- )
372
  fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Semester")
373
-
374
  st.plotly_chart(fig, use_container_width=True, config=config_options)
375
  return True
376
 
377
 
378
  def show_sentiment_by_prodi(df, aspek_columns):
379
- """
380
- Menampilkan distribusi sentimen per program studi dalam horizontal grouped bar chart.
381
- Program studi diurutkan berdasarkan total jumlah kritik/saran.
382
-
383
- Args:
384
- df (pd.DataFrame): DataFrame input
385
- aspek_columns (list): List nama kolom aspek
386
-
387
- Returns:
388
- bool/None: True jika berhasil, None jika kolom tidak tersedia
389
- """
390
- # Validasi: cek apakah kolom nama_prodi ada
391
  if 'nama_prodi' not in df.columns:
392
  return None
393
 
394
- # Transform data dari wide ke long format
395
  df_long = df.melt(
396
  id_vars=['nama_prodi'],
397
  value_vars=aspek_columns,
@@ -399,72 +224,51 @@ def show_sentiment_by_prodi(df, aspek_columns):
399
  value_name='sentimen'
400
  )
401
 
402
- # Group by prodi dan sentimen, hitung frekuensi
403
  prodi_sentiment = (
404
  df_long.groupby(['nama_prodi', 'sentimen'], observed=False)
405
  .size()
406
  .reset_index(name='jumlah')
407
  )
408
 
409
- # Hitung total per prodi untuk sorting
410
  total_per_prodi = (
411
  prodi_sentiment.groupby('nama_prodi')['jumlah']
412
  .sum()
413
  .sort_values(ascending=False)
414
  )
415
-
416
- # Reverse order untuk horizontal bar (terbanyak di atas)
417
  ordered_categories = total_per_prodi.index.tolist()[::-1]
418
 
419
- # Konversi ke categorical untuk maintain order
420
  prodi_sentiment['nama_prodi'] = pd.Categorical(
421
  prodi_sentiment['nama_prodi'],
422
  categories=ordered_categories,
423
  ordered=True
424
  )
425
 
426
- # Buat horizontal grouped bar chart
427
  fig = px.bar(
428
  prodi_sentiment,
429
  y='nama_prodi',
430
  x='jumlah',
431
  color='sentimen',
432
  barmode='group',
433
- orientation='h', # Horizontal orientation
434
  color_discrete_map=sentimen_palette
435
  )
436
  fig.update_layout(
437
  title="Distribusi Sentimen per Program Studi",
438
- yaxis={
439
- 'categoryorder': 'array',
440
- 'categoryarray': ordered_categories
441
- }
442
  )
443
-
444
  st.plotly_chart(fig, use_container_width=True, config=config_options)
445
  return True
446
 
447
 
448
  def show_sentiment_by_top10_matkul(df, aspek_columns):
449
- """
450
- Menampilkan distribusi sentimen pada 10 mata kuliah dengan kritik/saran terbanyak.
451
- Chart menggunakan horizontal grouped bar, diurutkan berdasarkan total kritik/saran.
452
-
453
- Args:
454
- df (pd.DataFrame): DataFrame input
455
- aspek_columns (list): List nama kolom aspek
456
-
457
- Returns:
458
- bool/None: True jika berhasil, None jika kolom tidak tersedia
459
- """
460
- # Validasi: cek apakah kolom yang diperlukan ada
461
  required_cols = ['kode_matakuliah', 'nama_matakuliah']
462
  missing_cols = [col for col in required_cols if col not in df.columns]
463
 
464
  if missing_cols:
465
  return None
466
 
467
- # Identifikasi top 10 mata kuliah berdasarkan jumlah kritik/saran
468
  df_top10 = (
469
  df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
470
  .size()
@@ -473,11 +277,9 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
473
  .index
474
  )
475
 
476
- # Filter data hanya untuk top 10 mata kuliah
477
  df_filtered = df[df.set_index(
478
  ['kode_matakuliah', 'nama_matakuliah']).index.isin(df_top10)]
479
 
480
- # Transform data dari wide ke long format
481
  df_long = df_filtered.melt(
482
  id_vars=['kode_matakuliah', 'nama_matakuliah'],
483
  value_vars=aspek_columns,
@@ -485,36 +287,29 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
485
  value_name='sentimen'
486
  )
487
 
488
- # Buat label gabungan: "kode - nama"
489
  df_long['label'] = (
490
  df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
491
  )
492
 
493
- # Group by label dan sentimen, hitung frekuensi
494
  matkul_sentiment = (
495
  df_long.groupby(['label', 'sentimen'], observed=False)
496
  .size()
497
  .reset_index(name='jumlah')
498
  )
499
 
500
- # Hitung total per label untuk sorting
501
  total_per_label = (
502
  matkul_sentiment.groupby('label')['jumlah']
503
  .sum()
504
  .sort_values(ascending=False)
505
  )
506
-
507
- # Reverse order untuk horizontal bar (terbanyak di atas)
508
  ordered_labels = total_per_label.index.tolist()[::-1]
509
 
510
- # Konversi ke categorical untuk maintain order
511
  matkul_sentiment['label'] = pd.Categorical(
512
  matkul_sentiment['label'],
513
  categories=ordered_labels,
514
  ordered=True
515
  )
516
 
517
- # Buat horizontal grouped bar chart
518
  fig = px.bar(
519
  matkul_sentiment,
520
  y='label',
@@ -526,11 +321,48 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
526
  )
527
  fig.update_layout(
528
  title="Distribusi Sentimen pada Top 10 Mata Kuliah",
529
- yaxis={
530
- 'categoryorder': 'array',
531
- 'categoryarray': ordered_labels
532
- }
533
  )
534
-
535
  st.plotly_chart(fig, use_container_width=True, config=config_options)
536
  return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from config import ASPEK_COLUMNS
14
 
15
 
16
# Custom color palette: fixed hex color for each sentiment label,
# shared by every chart so colors stay consistent across the dashboard.
sentimen_palette = {
    "netral": "#FFE24C",
    "positif": "#4CFF72",
    "negatif": "#FF4C4C"
}
# Canonical ordering of the sentiment categories used across all charts.
category_order = ["netral", "positif", "negatif"]

# Plotly display configuration: disable scroll-zoom and hide the mode bar.
config_options = {
    "scrollZoom": False,
    "displayModeBar": False
}
29
 
30
 
31
def show_sentiment_bar_chart(df_predicted, aspek_columns):
    """Render a grouped bar chart of sentiment counts per aspect.

    Args:
        df_predicted: DataFrame holding one sentiment-label column per aspect.
        aspek_columns: Names of the aspect columns to plot.
    """
    # Guard: need non-empty data and every requested aspect column present.
    has_all_aspects = set(aspek_columns).issubset(df_predicted.columns)
    if df_predicted.empty or not has_all_aspects:
        st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
        return

    # Wide -> long: one row per (aspek, sentimen) observation.
    melted = df_predicted.melt(
        value_vars=aspek_columns, var_name="aspek", value_name="sentimen"
    )
    # Fix the category order so legend/colors are stable across charts.
    melted["sentimen"] = pd.Categorical(
        melted["sentimen"], categories=category_order, ordered=True
    )

    counts = (
        melted.groupby(["aspek", "sentimen"], observed=False)
        .size()
        .reset_index(name="jumlah")
    )

    fig = px.bar(
        counts,
        x="aspek",
        y="jumlah",
        color="sentimen",
        barmode="group",
        color_discrete_map=sentimen_palette,
        category_orders={"sentimen": category_order},
    )
    fig.update_layout(title="Distribusi Sentimen per Aspek")
    st.plotly_chart(fig, use_container_width=True, config=config_options)
61
 
62
 
63
def show_sentiment_pie_chart(df_predicted, aspek_columns):
    """Render a donut chart of the overall sentiment composition.

    Args:
        df_predicted: DataFrame holding one sentiment-label column per aspect.
        aspek_columns: Names of the aspect columns to aggregate.
    """
    # Flatten all aspect columns into a single 1-D array of labels.
    all_labels = df_predicted[aspek_columns].values.ravel()

    counts = pd.Series(all_labels).value_counts().reset_index()
    counts.columns = ["sentimen", "jumlah"]
    counts = counts.sort_values("jumlah", ascending=False)

    fig = px.pie(
        counts,
        names="sentimen",
        values="jumlah",
        color="sentimen",
        color_discrete_map=sentimen_palette,
        hole=0.3,  # donut-style hole in the middle
    )
    fig.update_layout(title="Total Komposisi Sentimen")
    fig.update_traces(textposition='inside', textinfo='percent+label')
    st.plotly_chart(fig, use_container_width=True, config=config_options)
75
 
76
 
77
def show_year_distribution(df):
    """Render a bar chart of feedback counts per year.

    The year comes from a 'tahun' column, or is derived from 'tanggal'
    when 'tahun' is absent.

    Args:
        df (pd.DataFrame): Input data.

    Returns:
        bool/None: True when the chart was rendered, None when no usable
        year information exists.
    """
    # Fix: work on a copy so the derived 'tahun' column is never written
    # back into the caller's DataFrame (the original mutated it in place).
    df = df.copy()
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        # Invalid dates become NaT -> NaN years, dropped below.
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year

    if 'tahun' not in df.columns:
        return None  # No year column and nothing to derive it from.

    df_tahun = df.dropna(subset=['tahun']).copy()
    if df_tahun.empty:
        return None  # Every row had a missing/unparseable year.

    df_tahun['tahun'] = df_tahun['tahun'].astype(int)

    year_counts = df_tahun['tahun'].value_counts().reset_index()
    year_counts.columns = ['tahun', 'jumlah']
    year_counts = year_counts.sort_values('jumlah', ascending=False)

    fig = px.bar(year_counts, x='tahun', y='jumlah',
                 color='tahun', title="Distribusi Kritik/Saran per Tahun")
    # Treat years as discrete categories so the axis shows only real years.
    fig.update_layout(xaxis=dict(type='category'))
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
 
101
 
102
  def show_semester_distribution(df):
103
+ """Menampilkan distribusi jumlah kritik/saran per semester."""
 
 
 
 
 
 
 
 
 
104
  if 'semester' not in df.columns:
105
  return None
106
 
 
107
  semester_counts = df['semester'].value_counts().reset_index()
108
  semester_counts.columns = ['semester', 'jumlah']
109
  semester_counts = semester_counts.sort_values('jumlah', ascending=False)
110
+ fig = px.bar(semester_counts, x='semester', y='jumlah',
111
+ color='semester', title="Distribusi Kritik/Saran per Semester")
 
 
 
 
 
 
 
 
112
  fig.update_layout(xaxis=dict(categoryorder='total descending'))
 
113
  st.plotly_chart(fig, use_container_width=True, config=config_options)
114
  return True
115
 
116
 
117
def show_prodi_distribution(df):
    """Render a horizontal bar chart of feedback counts per study program.

    Returns:
        bool/None: True when rendered, None when 'nama_prodi' is missing.
    """
    if 'nama_prodi' not in df.columns:
        return None

    counts = df['nama_prodi'].value_counts().reset_index()
    counts.columns = ['nama_prodi', 'jumlah']
    # Ascending sort puts the busiest program at the top of the h-bar chart.
    counts = counts.sort_values(by='jumlah', ascending=True)

    fig = px.bar(
        counts,
        x='jumlah',
        y='nama_prodi',
        orientation='h',
        color='jumlah',
        title="Jumlah Kritik/Saran per Program Studi"
    )
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
135
 
136
 
137
  def show_top10_matkul_distribution(df):
138
+ """Menampilkan 10 mata kuliah dengan jumlah kritik/saran terbanyak."""
 
 
 
 
 
 
 
 
 
 
139
  required_cols = ['nama_matakuliah', 'kode_matakuliah']
140
  missing_cols = [col for col in required_cols if col not in df.columns]
141
 
142
  if missing_cols:
143
  return None
144
 
 
145
  matkul_counts = (
146
  df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
147
  .size()
 
149
  .sort_values(by='jumlah', ascending=False)
150
  .head(10)
151
  )
 
 
152
  matkul_counts['label'] = (
153
  matkul_counts['kode_matakuliah'] + " - " +
154
  matkul_counts['nama_matakuliah']
155
  )
 
 
156
  matkul_counts = matkul_counts.sort_values(by='jumlah', ascending=True)
157
 
 
158
  fig = px.bar(
159
  matkul_counts,
160
  x='jumlah',
 
163
  title="Top 10 Mata Kuliah Berdasarkan Kritik/Saran",
164
  color='jumlah'
165
  )
 
166
  st.plotly_chart(fig, use_container_width=True, config=config_options)
167
  return True
168
 
169
 
170
def show_sentiment_by_year(df, aspek_columns):
    """Render grouped bars of sentiment counts per year.

    Args:
        df (pd.DataFrame): Input data with one sentiment column per aspect.
        aspek_columns (list): Aspect column names holding sentiment labels.

    Returns:
        bool/None: True when rendered, None when no year info is available.
    """
    # Fix: copy first — the original wrote the derived 'tahun' column into
    # the caller's DataFrame as a side effect.
    df = df.copy()
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year

    if 'tahun' not in df.columns:
        return None

    # Wide -> long: one row per (tahun, aspek) sentiment observation.
    df_long = df.melt(id_vars=['tahun'],
                      value_vars=aspek_columns,
                      var_name='aspek',
                      value_name='sentimen')
    year_sentiment = df_long.groupby(
        ['tahun', 'sentimen'], observed=False
    ).size().reset_index(name='jumlah')
    year_sentiment = year_sentiment.sort_values('jumlah', ascending=False)

    fig = px.bar(year_sentiment, x='tahun', y='jumlah', color='sentimen',
                 barmode='group', color_discrete_map=sentimen_palette)
    fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Tahun")
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
192
 
193
 
194
def show_sentiment_by_semester(df, aspek_columns):
    """Render a grouped bar chart of sentiment counts per semester.

    Args:
        df (pd.DataFrame): Input data with one sentiment column per aspect.
        aspek_columns (list): Aspect column names holding sentiment labels.

    Returns:
        bool/None: True when rendered, None when 'semester' is missing.
    """
    if 'semester' not in df.columns:
        return None

    # Long form: one row per (semester, aspek) sentiment observation.
    long_form = df.melt(
        id_vars=['semester'],
        value_vars=aspek_columns,
        var_name='aspek',
        value_name='sentimen',
    )

    grouped = (
        long_form
        .groupby(['semester', 'sentimen'], observed=False)
        .size()
        .reset_index(name='jumlah')
        .sort_values('jumlah', ascending=False)
    )

    fig = px.bar(
        grouped,
        x='semester',
        y='jumlah',
        color='sentimen',
        barmode='group',
        color_discrete_map=sentimen_palette,
    )
    fig.update_layout(title="Distribusi Sentimen Kritik/Saran per Semester")
    st.plotly_chart(fig, use_container_width=True, config=config_options)
    return True
213
 
214
 
215
  def show_sentiment_by_prodi(df, aspek_columns):
216
+ """Menampilkan distribusi sentimen per program studi."""
 
 
 
 
 
 
 
 
 
 
 
217
  if 'nama_prodi' not in df.columns:
218
  return None
219
 
 
220
  df_long = df.melt(
221
  id_vars=['nama_prodi'],
222
  value_vars=aspek_columns,
 
224
  value_name='sentimen'
225
  )
226
 
 
227
  prodi_sentiment = (
228
  df_long.groupby(['nama_prodi', 'sentimen'], observed=False)
229
  .size()
230
  .reset_index(name='jumlah')
231
  )
232
 
 
233
  total_per_prodi = (
234
  prodi_sentiment.groupby('nama_prodi')['jumlah']
235
  .sum()
236
  .sort_values(ascending=False)
237
  )
 
 
238
  ordered_categories = total_per_prodi.index.tolist()[::-1]
239
 
 
240
  prodi_sentiment['nama_prodi'] = pd.Categorical(
241
  prodi_sentiment['nama_prodi'],
242
  categories=ordered_categories,
243
  ordered=True
244
  )
245
 
 
246
  fig = px.bar(
247
  prodi_sentiment,
248
  y='nama_prodi',
249
  x='jumlah',
250
  color='sentimen',
251
  barmode='group',
252
+ orientation='h',
253
  color_discrete_map=sentimen_palette
254
  )
255
  fig.update_layout(
256
  title="Distribusi Sentimen per Program Studi",
257
+ yaxis={'categoryorder': 'array',
258
+ 'categoryarray': ordered_categories}
 
 
259
  )
 
260
  st.plotly_chart(fig, use_container_width=True, config=config_options)
261
  return True
262
 
263
 
264
  def show_sentiment_by_top10_matkul(df, aspek_columns):
265
+ """Menampilkan distribusi sentimen pada 10 mata kuliah teratas."""
 
 
 
 
 
 
 
 
 
 
 
266
  required_cols = ['kode_matakuliah', 'nama_matakuliah']
267
  missing_cols = [col for col in required_cols if col not in df.columns]
268
 
269
  if missing_cols:
270
  return None
271
 
 
272
  df_top10 = (
273
  df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
274
  .size()
 
277
  .index
278
  )
279
 
 
280
  df_filtered = df[df.set_index(
281
  ['kode_matakuliah', 'nama_matakuliah']).index.isin(df_top10)]
282
 
 
283
  df_long = df_filtered.melt(
284
  id_vars=['kode_matakuliah', 'nama_matakuliah'],
285
  value_vars=aspek_columns,
 
287
  value_name='sentimen'
288
  )
289
 
 
290
  df_long['label'] = (
291
  df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
292
  )
293
 
 
294
  matkul_sentiment = (
295
  df_long.groupby(['label', 'sentimen'], observed=False)
296
  .size()
297
  .reset_index(name='jumlah')
298
  )
299
 
 
300
  total_per_label = (
301
  matkul_sentiment.groupby('label')['jumlah']
302
  .sum()
303
  .sort_values(ascending=False)
304
  )
 
 
305
  ordered_labels = total_per_label.index.tolist()[::-1]
306
 
 
307
  matkul_sentiment['label'] = pd.Categorical(
308
  matkul_sentiment['label'],
309
  categories=ordered_labels,
310
  ordered=True
311
  )
312
 
 
313
  fig = px.bar(
314
  matkul_sentiment,
315
  y='label',
 
321
  )
322
  fig.update_layout(
323
  title="Distribusi Sentimen pada Top 10 Mata Kuliah",
324
+ yaxis={'categoryorder': 'array', 'categoryarray': ordered_labels}
 
 
 
325
  )
 
326
  st.plotly_chart(fig, use_container_width=True, config=config_options)
327
  return True
328
+
329
+
330
def show_sentiment_stacked_percentage(df, aspek_columns):
    """Render a stacked bar chart of sentiment percentages per aspect.

    Each bar sums to 100% and shows the share of every sentiment label
    within that aspect.

    Args:
        df (pd.DataFrame): Input data with one sentiment column per aspect.
        aspek_columns (list): Aspect column names holding sentiment labels.
    """
    if df.empty or not set(aspek_columns).issubset(df.columns):
        st.warning("Data atau kolom aspek tidak tersedia.")
        return

    # Wide -> long: one row per (aspek, sentimen) observation.
    df_long = df.melt(
        value_vars=aspek_columns,
        var_name="aspek",
        value_name="sentimen"
    )

    # Count per (aspek, sentimen). Fix: pass observed=False explicitly to
    # match every sibling function and keep behavior stable (and warning-free)
    # when the input columns are categorical.
    count_data = df_long.groupby(
        ['aspek', 'sentimen'], observed=False).size().reset_index(name='jumlah')
    total_per_aspek = count_data.groupby(
        'aspek', observed=False)['jumlah'].sum().reset_index()
    total_per_aspek.columns = ['aspek', 'total']
    count_data = count_data.merge(total_per_aspek, on='aspek')
    # Share of each sentiment within its aspect, in percent.
    count_data['persentase'] = (
        count_data['jumlah'] / count_data['total']) * 100

    fig = px.bar(
        count_data,
        x="aspek",
        y="persentase",
        color="sentimen",
        title="Persentase Distribusi Sentimen per Aspek",
        color_discrete_map=sentimen_palette,
        category_orders={
            "sentimen": category_order,
            "aspek": aspek_columns
        }
    )
    fig.update_layout(
        yaxis_title="Persentase (%)",
        xaxis_title="Aspek"
    )
    st.plotly_chart(fig, use_container_width=True, config=config_options)