zdannn2808 committed
Commit aff4068 · verified · 1 parent: 77a2742

add some comments

Files changed (2):
  1. app.py +87 -38
  2. visualization.py +22 -4
app.py CHANGED
@@ -34,11 +34,12 @@ from visualization import (
 )
 from preprocessing import text_preprocessing_pipeline
 
-# Configuration for chunked processing
+# Configuration for chunked processing (splits large data into smaller parts)
 CHUNK_SIZE = 2500
 ENABLE_CHUNKED = True
 CACHE_EXPIRY_HOURS = 24
 
+# Create directories for storing cache files
 os.makedirs("chache_file", exist_ok=True)
 os.makedirs("chache_file/sessions", exist_ok=True)
 
@@ -56,27 +57,30 @@ st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/fon
 
 
 def get_session_id():
-    """Generate or retrieve a session ID for the user - PERSISTENT across refresh"""
+    """Generate or fetch the user's session ID - persists even across page refreshes"""
     query_params = st.query_params
 
+    # Check whether a session ID already exists in the URL parameters
     if "sid" in query_params:
        sid = query_params["sid"]
        st.session_state.session_id = sid
        return sid
 
+    # If none exists yet, create a new session ID
    if "session_id" not in st.session_state:
        new_session_id = str(uuid.uuid4())
        st.session_state.session_id = new_session_id
        st.query_params["sid"] = new_session_id
        return new_session_id
 
+    # If it already exists in session state, reuse the existing one
    existing_id = st.session_state.session_id
    st.query_params["sid"] = existing_id
    return existing_id
 
 
 def get_session_cache_dir():
-    """Get the cache directory for this session"""
+    """Get the cache directory dedicated to this session"""
    sid = get_session_id()
    cache_dir = Path(f"chache_file/sessions/{sid}")
    cache_dir.mkdir(parents=True, exist_ok=True)
@@ -84,14 +88,14 @@ def get_session_cache_dir():
 
 
 def get_session_chunks_dir():
-    """Get the chunks directory for this session"""
+    """Get the chunks directory dedicated to this session"""
    chunks_dir = get_session_cache_dir() / "chunks"
    chunks_dir.mkdir(parents=True, exist_ok=True)
    return chunks_dir
 
 
 def cleanup_old_sessions():
-    """Delete session caches that have expired (> 24 hours)"""
+    """Delete session caches that have expired (older than 24 hours)"""
    sessions_dir = Path("chache_file/sessions")
    if not sessions_dir.exists():
        return
@@ -102,6 +106,7 @@ def cleanup_old_sessions():
        mod_time = session_dir.stat().st_mtime
        age_hours = (current_time - mod_time) / 3600
 
+        # Delete if older than CACHE_EXPIRY_HOURS
        if age_hours > CACHE_EXPIRY_HOURS:
            try:
                shutil.rmtree(session_dir)
@@ -110,18 +115,21 @@ def cleanup_old_sessions():
                print(f"Error deleting session {session_dir.name}: {e}")
 
 
+# Run cleanup when the app starts
 cleanup_old_sessions()
 
 
 @st.cache_resource(show_spinner=False)
 def get_model_resources():
-    """Load the IndoBERT model and tokenizer."""
+    """Load the IndoBERT model and tokenizer (cached so they are not reloaded repeatedly)"""
    return load_model_and_tokenizer()
 
 
+# Load the model and tokenizer behind a spinner
 with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
    model, tokenizer, le, device = get_model_resources()
 
+# Show a temporary success notification
 success_placeholder = st.empty()
 success_placeholder.success("Model dan tokenizer berhasil dimuat!")
 time.sleep(1)
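The session-handling hunks above pin each user's cache directory to an ID that survives page refreshes by mirroring it into the URL query string (sid), since st.session_state alone is cleared on refresh. A minimal standalone sketch of the same pattern, assuming a recent Streamlit with the st.query_params API (the function name is illustrative, not the app's):

import uuid

import streamlit as st

def persistent_session_id() -> str:
    # Prefer the ID already in the URL; otherwise reuse session state;
    # otherwise mint a new one. Writing it back into the URL lets a
    # refreshed page (with empty session state) recover the same ID.
    if "sid" in st.query_params:
        st.session_state.session_id = st.query_params["sid"]
    elif "session_id" not in st.session_state:
        st.session_state.session_id = str(uuid.uuid4())
    st.query_params["sid"] = st.session_state.session_id
    return st.session_state.session_id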
@@ -129,7 +137,7 @@ success_placeholder.empty()
 
 
 def convert_df_to_excel(df):
-    """Convert a DataFrame into an Excel file as a byte stream."""
+    """Convert a DataFrame into an Excel byte stream for download"""
    output = BytesIO()
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False)
@@ -137,7 +145,7 @@ def convert_df_to_excel(df):
 
 
 def clear_memory():
-    """Clear memory cache"""
+    """Clear memory caches to optimize performance"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
@@ -146,9 +154,9 @@ def clear_memory():
 def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
    """
    Process one chunk of data with batch processing.
-    Progress bar: Preprocessing 0-100%, then Predicting 0-100%
+    The progress bar shows: Preprocessing 0-100%, then Predicting 0-100%
    """
-    # STEP 1: Preprocessing (0-100%)
+    # STEP 1: Text preprocessing (0-100%)
    cleaned_text_list = []
    total_rows = len(chunk_dataframe)
 
@@ -156,6 +164,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
        clean_text = text_preprocessing_pipeline(str(raw_text))
        cleaned_text_list.append(clean_text)
 
+        # Update the progress bar every 50 rows
        if idx % 50 == 0 or idx == total_rows - 1:
            progress = (idx + 1) / total_rows
            progress_bar.progress(progress)
@@ -168,11 +177,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
            f"Chunk {chunk_num}/{total_chunk_count} | Memulai prediksi...")
    time.sleep(0.2)
 
-    # STEP 2: Batch Prediction (0-100%)
+    # STEP 2: Batch prediction with the model (0-100%)
    batch_sz = CONFIG.get("batch_size", 32)
    num_sents = len(cleaned_text_list)
    num_asps = len(ASPEK_COLUMNS)
 
+    # Prepare the dataset and dataloader
    ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
                     tokenizer, CONFIG["max_len"])
    dl = DataLoader(
@@ -182,11 +192,13 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
        num_workers=0
    )
 
+    # Matrix for storing the prediction results
    predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
 
    batch_counter = 0
    total_batch_count = len(dl)
 
+    # Run prediction batch by batch
    model.eval()
    with torch.no_grad():
        for batch_data in dl:
@@ -195,22 +207,25 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
            sent_idxs = batch_data['sent_idx'].numpy()
            asp_idxs = batch_data['aspect_idx'].numpy()
 
+            # Predict and convert to labels
            model_outputs = model(inp_ids, attn_mask)
            probabilities = F.softmax(model_outputs, dim=1)
            predicted_indices = torch.argmax(
                probabilities, dim=1).cpu().numpy()
            pred_labels = le.inverse_transform(predicted_indices)
 
+            # Store the predictions in the matrix
            for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
                predictions_matrix[s_idx][a_idx] = lbl
 
+            # Update the progress bar
            batch_counter += 1
            progress = batch_counter / total_batch_count
            progress_bar.progress(progress)
            status_text.text(
                f"Chunk {chunk_num}/{total_chunk_count} | Predicting: {batch_counter}/{total_batch_count} batches")
 
-    # STEP 3: Combine results
+    # STEP 3: Merge the predictions back into the original data
    result_list = []
    for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
        row_dict = data_row.to_dict()
@@ -221,11 +236,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
 
    result_dataframe = pd.DataFrame(result_list)
 
+    # Save the chunk result to a CSV file
    chunks_directory = get_session_chunks_dir()
    chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
    result_dataframe.to_csv(chunk_filepath, index=False)
 
-    # Complete progress
+    # Progress complete
    progress_bar.progress(1.0)
    status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
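process_chunk_batch persists every finished chunk as chunk_{n}.csv, and the upload handler further down skips chunks whose file already exists, so an interrupted run resumes where it stopped. The same resume loop condensed into a framework-free sketch (the process callback is a hypothetical stand-in for the real preprocessing plus prediction):

from pathlib import Path

import pandas as pd

CHUNK_SIZE = 2500  # same constant the app defines

def run_chunked(df: pd.DataFrame, chunks_dir: Path, process) -> pd.DataFrame:
    # Work through df in CHUNK_SIZE slices; each completed slice is cached
    # as a CSV, so a re-run only computes the chunks that are still missing.
    chunks_dir.mkdir(parents=True, exist_ok=True)
    results = []
    for num, start in enumerate(range(0, len(df), CHUNK_SIZE), start=1):
        cache = chunks_dir / f"chunk_{num}.csv"
        if cache.exists():  # already processed: reuse the cached result
            results.append(pd.read_csv(cache))
            continue
        out = process(df.iloc[start:start + CHUNK_SIZE].copy())
        out.to_csv(cache, index=False)  # persist before moving on
        results.append(out)
    return pd.concat(results, ignore_index=True)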
 
@@ -235,7 +251,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
 
 
 def get_available_columns(df):
-    """Detect which columns are available in the dataframe"""
+    """Detect which dataframe columns are available for dynamic filters and visualizations"""
    available = {
        'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
        'has_semester': 'semester' in df.columns,
@@ -256,7 +272,7 @@ st.markdown(" ")
 st.markdown(" ")
 st.markdown(" ")
 
-# Usage guide
+# Application usage guide
 steps = [
    {"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
     "description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
@@ -283,18 +299,19 @@ for i, step in enumerate(steps):
 st.markdown("")
 st.markdown("")
 
-# Upload file
+# Excel file upload
 uploaded_file = st.file_uploader(
    " Upload Data Kritik & Saran",
    type=["xlsx"],
    help="File maksimal 200MB dengan format .xlsx"
 )
 
-# Clear cache buttons - SESSION SPECIFIC
+# Cache-clearing buttons - SESSION SPECIFIC
 session_cache_dir = get_session_cache_dir()
 session_result_file = session_cache_dir / "temp_predicted.csv"
 session_chunks_dir = get_session_chunks_dir()
 
+# Button to clear the cached prediction results
 if session_result_file.exists():
    if st.button("Hapus Cache Data"):
        session_result_file.unlink()
@@ -302,6 +319,7 @@ if session_result_file.exists():
        time.sleep(1)
        st.rerun()
 
+# Button to clear the cached chunks
 if session_chunks_dir.exists():
    chunk_files = list(session_chunks_dir.glob("*.csv"))
    if chunk_files:
@@ -313,6 +331,7 @@ if session_chunks_dir.exists():
            time.sleep(1)
            st.rerun()
 
+# Show info about the cached file, if any
 if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
    if not uploaded_file:
        metadata_file = session_cache_dir / "metadata.txt"
@@ -334,9 +353,11 @@ if session_result_file.exists() or (session_chunks_dir.exists() and list(session
        st.caption(" ")
 
 
+# Initialize session state for storing the prediction results
 if "df_predicted" not in st.session_state:
    st.session_state.df_predicted = None
 
+# Load from cache if available
 if st.session_state.df_predicted is None and session_result_file.exists():
    try:
        df_cached = pd.read_csv(session_result_file)
@@ -349,14 +370,17 @@ if st.session_state.df_predicted is None and session_result_file.exists():
        st.warning(f"Gagal memuat cache: {e}")
 
 
+# Process the uploaded file
 if uploaded_file:
    file_bytes = uploaded_file.getvalue()
+    # Check whether this is a new file or the same one as before
    if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
        st.session_state.last_uploaded_file = file_bytes
        st.session_state.uploaded_filename = uploaded_file.name
        try:
            df_uploaded = pd.read_excel(BytesIO(file_bytes))
 
+            # Convert the year column if present
            if "tahun" in df_uploaded.columns:
                df_uploaded["tahun"] = pd.to_numeric(
                    df_uploaded["tahun"], errors='coerce').astype('Int64')
@@ -364,11 +388,14 @@ if uploaded_file:
        except ValueError as err:
            st.error(f"Gagal membaca file: {err}")
        else:
+            # Validate that the required kritik_saran column exists
            if "kritik_saran" not in df_uploaded.columns:
                st.error("Kolom 'kritik_saran' tidak ditemukan.")
            else:
+                # Drop duplicates based on the kritik_saran column
                df_uploaded = df_uploaded.drop_duplicates(
                    subset=["kritik_saran"])
+                # Add aspect columns if they do not exist yet
                for aspect_col in ASPEK_COLUMNS:
                    if aspect_col not in df_uploaded.columns:
                        df_uploaded[aspect_col] = None
@@ -376,9 +403,11 @@ if uploaded_file:
                st.markdown("### Preprocessing dan Prediksi")
 
                total_rows = len(df_uploaded)
+                # Decide whether to use chunked processing
                use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
 
                if use_chunked:
+                    # CHUNKED PROCESSING MODE for large datasets
                    num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
 
                    info_col1, info_col2, info_col3 = st.columns(3)
@@ -397,6 +426,7 @@ if uploaded_file:
                    chunk_status_text = st.empty()
                    overall_status = st.empty()
 
+                    # Process each chunk
                    for start_idx in range(0, total_rows, CHUNK_SIZE):
                        current_chunk_number = (start_idx // CHUNK_SIZE) + 1
                        current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
@@ -405,6 +435,7 @@ if uploaded_file:
                        current_chunk_file = session_chunks_dir / \
                            f"chunk_{current_chunk_number}.csv"
 
+                        # Check whether the chunk was already processed (present in the cache)
                        if current_chunk_file.exists():
                            chunk_result = pd.read_csv(current_chunk_file)
                            all_chunk_results.append(chunk_result)
@@ -423,6 +454,7 @@ if uploaded_file:
                            time.sleep(0.3)
                            continue
 
+                        # Process a new chunk
                        chunk_progress_bar.progress(0)
 
                        chunk_result = process_chunk_batch(
@@ -431,6 +463,7 @@ if uploaded_file:
                        )
                        all_chunk_results.append(chunk_result)
 
+                        # Estimate the remaining time
                        processed = min(start_idx + CHUNK_SIZE, total_rows)
                        progress_pct = (processed / total_rows) * 100
                        elapsed = time.time() - start_time
@@ -445,6 +478,7 @@ if uploaded_file:
 
                        time.sleep(0.3)
 
+                    # Merge all chunk results
                    chunk_status_text.empty()
                    overall_status.info("🔄 Menggabungkan semua chunks...")
                    df_session = pd.concat(
@@ -455,6 +489,7 @@ if uploaded_file:
                    duration = end_time - start_time
 
                else:
+                    # BATCH PROCESSING MODE for small datasets
                    st.info(
                        f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
 
@@ -463,6 +498,7 @@ if uploaded_file:
                    progress_bar = st.progress(0)
                    status_text = st.empty()
 
+                    # STEP 1: Preprocessing
                    cleaned_text_list = []
                    total_preprocessing = len(df_uploaded)
 
@@ -476,6 +512,7 @@ if uploaded_file:
                            status_text.text(
                                f"Preprocessing: {idx+1}/{total_preprocessing} rows")
 
+                    # STEP 2: Prediction
                    progress_bar.progress(0)
                    status_text.text("Memulai prediksi...")
                    time.sleep(0.3)
@@ -519,6 +556,7 @@ if uploaded_file:
                            status_text.text(
                                f"Predicting: {batch_counter}/{total_batch_count} batches")
 
+                    # STEP 3: Merge the results
                    result_list = []
                    for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
                        row_dict = data_row.to_dict()
@@ -538,16 +576,20 @@ if uploaded_file:
                end_time = time.time()
                duration = end_time - start_time
 
+                # Save the results to session state and the cache file
                st.session_state.df_predicted = df_session
                df_session.to_csv(session_result_file, index=False)
 
+                # Save the filename metadata
                metadata_file = session_cache_dir / "metadata.txt"
                with open(metadata_file, "w", encoding="utf-8") as f:
                    f.write(uploaded_file.name)
 
+                # Compute the processing performance
                total_items = total_rows * len(ASPEK_COLUMNS)
                items_per_second = total_items / duration if duration > 0 else 0
 
+                # Show a summary of the processing results
                if use_chunked:
                    st.success(
                        f"✅ **Chunked + Batch Processing selesai!**\n\n"
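The chunk loop above reports overall progress from elapsed wall-clock time. The hunk elides the exact remaining-time arithmetic, but a standard linear extrapolation over the processed row count looks like this (a sketch under that assumption, not necessarily the app's exact formula):

import time

def eta_seconds(start_time: float, processed: int, total: int) -> float:
    # Linear estimate: remaining rows divided by the observed rate so far.
    elapsed = time.time() - start_time
    if processed == 0 or elapsed == 0:
        return float("inf")
    rate = processed / elapsed  # rows per second so far
    return (total - processed) / rate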
@@ -567,11 +609,11 @@ if uploaded_file:
                        f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
                    )
 
-# After prediction finishes
+# Display the prediction results and visualizations
 if st.session_state.df_predicted is not None:
    df_predicted = st.session_state.df_predicted
 
-    # Detect the available columns
+    # Detect the columns available for dynamic filters
    available_cols = get_available_columns(df_predicted)
 
    # Sidebar filters with dynamic column checks
@@ -586,7 +628,7 @@ if st.session_state.df_predicted is not None:
        st.sidebar.info(
            "Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
 
-    # Mata Kuliah filter
+    # Mata Kuliah filter (if the column is available)
    selected_matkul = []
    if available_cols['has_matkul']:
        matkul_options = sorted(
@@ -595,7 +637,7 @@ if st.session_state.df_predicted is not None:
        selected_matkul = st.sidebar.multiselect(
            "Nama Mata Kuliah", matkul_options, default=matkul_options)
 
-    # Program Studi filter
+    # Program Studi filter (if the column is available)
    selected_prodi = []
    if available_cols['has_prodi']:
        prodi_options = sorted(
@@ -604,7 +646,7 @@ if st.session_state.df_predicted is not None:
        selected_prodi = st.sidebar.multiselect(
            "Program Studi", prodi_options, default=prodi_options)
 
-    # Tahun filter
+    # Tahun filter (if the column is available)
    selected_tahun = []
    if available_cols['has_tahun']:
        if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
@@ -617,7 +659,7 @@ if st.session_state.df_predicted is not None:
        selected_tahun = st.sidebar.multiselect(
            "Tahun", tahun_options, default=tahun_options)
 
-    # Semester filter
+    # Semester filter (if the column is available)
    selected_semester = []
    if available_cols['has_semester']:
        semester_options = sorted(
@@ -626,7 +668,7 @@ if st.session_state.df_predicted is not None:
        selected_semester = st.sidebar.multiselect(
            "Semester", semester_options, default=semester_options)
 
-    # Apply filters
+    # Apply all selected filters
    df_filtered = df_clean.copy()
 
    if selected_matkul and available_cols['has_matkul']:
@@ -648,7 +690,7 @@ if st.session_state.df_predicted is not None:
    st.markdown("### Tabel Data Hasil Prediksi")
    st.dataframe(df_filtered, width='stretch')
 
-    # Download buttons
+    # Download buttons for the filtered data and all data
    col_dl1, col_dl2 = st.columns(2)
    with col_dl1:
        st.download_button(
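Both download buttons serve the table through convert_df_to_excel from earlier in this diff, which renders the DataFrame into an in-memory Excel file. Its essence, with an illustrative hookup (the button label and file name here are placeholders, not the app's exact strings):

from io import BytesIO

import pandas as pd
import streamlit as st

def df_to_excel_bytes(df: pd.DataFrame) -> bytes:
    # Write into an in-memory buffer so st.download_button can serve the
    # bytes without touching disk (requires the openpyxl engine).
    buf = BytesIO()
    with pd.ExcelWriter(buf, engine="openpyxl") as writer:
        df.to_excel(writer, index=False)
    return buf.getvalue()

# st.download_button("Download Excel", data=df_to_excel_bytes(df_filtered),
#                    file_name="hasil_prediksi.xlsx")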
@@ -677,17 +719,18 @@ if st.session_state.df_predicted is not None:
    st.markdown("### Ringkasan Cepat")
    st.markdown("")
 
+    # Count the sentiment totals across all aspects
    total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
    total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
    total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
 
-    # Count the columns available for the summary
+    # Determine the summary columns based on the available data
    summary_cols = []
 
    # Base columns (always present)
    summary_cols.extend(['ulasan', 'aspek'])
 
-    # Optional columns
+    # Optional columns, depending on data availability
    if available_cols['has_matkul']:
        summary_cols.append('matkul')
    if available_cols['has_prodi']:
@@ -695,31 +738,31 @@ if st.session_state.df_predicted is not None:
    if available_cols['has_semester']:
        summary_cols.append('semester')
 
-    # Create dynamic columns based on the available data
+    # Create dynamic columns to display the metrics
    num_cols = len(summary_cols)
    cols = st.columns(num_cols)
 
    col_idx = 0
 
-    # Reviews & aspects (always present)
+    # Base metrics: number of reviews & aspects
    cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
    col_idx += 1
    cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
    col_idx += 1
 
-    # Mata Kuliah (if present)
+    # Mata Kuliah metric (if available)
    if available_cols['has_matkul']:
        matkul_count = df_filtered['nama_matakuliah'].nunique()
        cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
        col_idx += 1
 
-    # Prodi (if present)
+    # Prodi metric (if available)
    if available_cols['has_prodi']:
        prodi_count = df_filtered['nama_prodi'].nunique()
        cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
        col_idx += 1
 
-    # Semester (if present)
+    # Semester metric (if available)
    if available_cols['has_semester']:
        semester_count = df_filtered['semester'].nunique()
        cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
@@ -727,7 +770,7 @@ if st.session_state.df_predicted is not None:
 
    st.markdown("")
 
-    # Second row: sentiment + extra info
+    # Second row: sentiment metrics and extra info
    summary_cols2 = ['positif', 'netral', 'negatif']
 
    if available_cols['has_tahun']:
@@ -738,6 +781,7 @@ if st.session_state.df_predicted is not None:
    cols2 = st.columns(len(summary_cols2))
 
    col_idx2 = 0
+    # One metric per sentiment class
    cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
    col_idx2 += 1
    cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
@@ -745,7 +789,7 @@ if st.session_state.df_predicted is not None:
    cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
    col_idx2 += 1
 
-    # Year range (if present)
+    # Year-range metric (if available)
    if available_cols['has_tahun']:
        if 'tahun' in df_filtered.columns:
            tahun_valid = df_filtered['tahun'].dropna()
@@ -763,7 +807,7 @@ if st.session_state.df_predicted is not None:
            cols2[col_idx2].metric("Rentang Tahun", "N/A")
            col_idx2 += 1
 
-    # Average word count (if present)
+    # Average word-count metric (if available)
    if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
        try:
            word_counts = df_filtered['kritik_saran'].astype(
@@ -784,9 +828,10 @@ if st.session_state.df_predicted is not None:
    with col2:
        show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
 
-    # Visualizations based on the available columns
+    # Distribution visualizations based on the available columns
    viz_shown = False
 
+    # Year and semester visualizations (if available)
    if available_cols['has_tahun'] or available_cols['has_semester']:
        col1, col2 = st.columns(2)
        with col1:
@@ -800,19 +845,21 @@ if st.session_state.df_predicted is not None:
            if result:
                viz_shown = True
 
+    # Program Studi visualization (if available)
    if available_cols['has_prodi']:
        st.markdown("---")
        result = show_prodi_distribution(df_filtered)
        if result:
            viz_shown = True
 
+    # Top-10 Mata Kuliah visualization (if available)
    if available_cols['has_matkul']:
        st.markdown("---")
        result = show_top10_matkul_distribution(df_filtered)
        if result:
            viz_shown = True
 
-    # Sentiment per year/semester
+    # Sentiment per year/semester visualization (if available)
    if available_cols['has_tahun'] or available_cols['has_semester']:
        st.markdown("---")
        col1, col2 = st.columns(2)
@@ -827,19 +874,21 @@ if st.session_state.df_predicted is not None:
            if result:
                viz_shown = True
 
+    # Sentiment per Program Studi visualization (if available)
    if available_cols['has_prodi']:
        st.markdown("---")
        result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
        if result:
            viz_shown = True
 
+    # Sentiment per top-10 Mata Kuliah visualization (if available)
    if available_cols['has_matkul']:
        st.markdown("---")
        result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
        if result:
            viz_shown = True
 
-    # Footer
+    # App footer
    st.caption("""
    <div class='footer'>
        © 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
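The quick-summary hunks size their metric rows dynamically: they first collect which summary slots the data supports, then create exactly that many st.columns and fill them one by one. Reduced to a reusable sketch (the metrics dict and its labels are hypothetical):

import streamlit as st

def render_metrics(metrics: dict) -> None:
    # One st.metric per available entry; the column count adapts to the data.
    cols = st.columns(len(metrics))
    for col, (label, value) in zip(cols, metrics.items()):
        col.metric(label, value)

# e.g. render_metrics({"Jumlah Ulasan": "1,234", "Jumlah Aspek": 5})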
 
visualization.py CHANGED
@@ -13,12 +13,13 @@ import plotly.express as px
 from config import ASPEK_COLUMNS
 
 
-# Custom color palette
+# Color definitions for each sentiment category
 sentimen_palette = {
    "netral": "#FFE24C",
    "positif": "#4CFF72",
    "negatif": "#FF4C4C"
 }
+# Category order for a consistent display across all charts
 category_order = ["netral", "positif", "negatif"]
 
 # Plotly configuration
@@ -30,20 +31,24 @@ config_options = {
 
 def show_sentiment_bar_chart(df_predicted, aspek_columns):
    """Show a bar chart of the sentiment distribution per aspect."""
+    # Validate the data and the required columns
    if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
        st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
        return
 
+    # Transform from wide to long format for visualization
    df_long = df_predicted.melt(
        value_vars=aspek_columns,
        var_name="aspek",
        value_name="sentimen"
    )
+    # Convert to categorical to guarantee a consistent order
    df_long["sentimen"] = pd.Categorical(
        df_long["sentimen"],
        categories=category_order,
        ordered=True
    )
+    # Aggregate to count the rows per aspect and sentiment
    count_data = df_long.groupby(
        ["aspek", "sentimen"], observed=False
    ).size().reset_index(name="jumlah")
@@ -62,10 +67,12 @@ def show_sentiment_bar_chart(df_predicted, aspek_columns):
 
 def show_sentiment_pie_chart(df_predicted, aspek_columns):
    """Show a pie chart of the overall sentiment distribution."""
+    # Flatten the sentiment values of all aspects into a 1D array
    sentimen_total = df_predicted[aspek_columns].values.ravel()
    sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
    sentimen_counts.columns = ["sentimen", "jumlah"]
    sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
+    # Donut chart via the hole parameter
    fig = px.pie(sentimen_counts, names="sentimen", values="jumlah",
                 color="sentimen", color_discrete_map=sentimen_palette,
                 hole=0.3)
@@ -76,12 +83,13 @@ def show_sentiment_pie_chart(df_predicted, aspek_columns):
 
 def show_year_distribution(df):
    """Show the distribution of feedback counts per year."""
-    # Try to extract the year from the tanggal column if present
+    # Extract the year from the tanggal column if no tahun column is available
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
 
+    # Return None if there is no year data (handled by the caller)
    if 'tahun' not in df.columns:
-        return None  # No tahun column available
+        return None
 
    df_tahun = df.dropna(subset=['tahun']).copy()
    if df_tahun.empty:
@@ -121,6 +129,7 @@ def show_prodi_distribution(df):
 
    prodi_counts = df['nama_prodi'].value_counts().reset_index()
    prodi_counts.columns = ['nama_prodi', 'jumlah']
+    # Sort ascending for the horizontal bar (small values at the bottom)
    prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
    fig = px.bar(
        prodi_counts,
@@ -142,6 +151,7 @@ def show_top10_matkul_distribution(df):
    if missing_cols:
        return None
 
+    # Group to count the frequency per course
    matkul_counts = (
        df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
        .size()
@@ -149,6 +159,7 @@ def show_top10_matkul_distribution(df):
        .sort_values(by='jumlah', ascending=False)
        .head(10)
    )
+    # Combine code and name into an informative label
    matkul_counts['label'] = (
        matkul_counts['kode_matakuliah'] + " - " +
        matkul_counts['nama_matakuliah']
@@ -169,13 +180,14 @@ def show_top10_matkul_distribution(df):
 
 def show_sentiment_by_year(df, aspek_columns):
    """Show the sentiment distribution per year."""
-    # Try to extract the year from the tanggal column if present
+    # Extract the year from the tanggal column if needed
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
 
    if 'tahun' not in df.columns:
        return None
 
+    # Transform to long format with tahun as the id_vars
    df_long = df.melt(id_vars=['tahun'],
                      value_vars=aspek_columns,
                      var_name='aspek',
@@ -230,13 +242,16 @@ def show_sentiment_by_prodi(df, aspek_columns):
        .reset_index(name='jumlah')
    )
 
+    # Compute the total per prodi to sort from most to least
    total_per_prodi = (
        prodi_sentiment.groupby('nama_prodi')['jumlah']
        .sum()
        .sort_values(ascending=False)
    )
+    # Reverse the order for the horizontal bar (largest values on top)
    ordered_categories = total_per_prodi.index.tolist()[::-1]
 
+    # Convert to categorical to control the display order
    prodi_sentiment['nama_prodi'] = pd.Categorical(
        prodi_sentiment['nama_prodi'],
        categories=ordered_categories,
@@ -269,6 +284,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
    if missing_cols:
        return None
 
+    # Keep the top 10 courses by frequency
    df_top10 = (
        df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
        .size()
@@ -287,6 +303,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
        value_name='sentimen'
    )
 
+    # Combine code and name into a label
    df_long['label'] = (
        df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
    )
@@ -297,6 +314,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
        .reset_index(name='jumlah')
    )
 
+    # Sort by the total sentiment per course
    total_per_label = (
        matkul_sentiment.groupby('label')['jumlah']
        .sum()
 
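Most chart helpers in this file share the reshaping idiom the new comments describe: melt the per-aspect sentiment columns from wide to long, pin the category order with a pandas Categorical, then count. A self-contained sketch with toy data (the aspect column names are made up; aspek, sentimen, and jumlah follow the file's conventions):

import pandas as pd

category_order = ["netral", "positif", "negatif"]

df = pd.DataFrame({
    "fasilitas": ["positif", "negatif", "positif"],
    "pengajaran": ["netral", "positif", "negatif"],
})

# Wide -> long: one row per (aspect, sentiment) observation.
df_long = df.melt(var_name="aspek", value_name="sentimen")
# A Categorical fixes the legend/axis order across every chart.
df_long["sentimen"] = pd.Categorical(
    df_long["sentimen"], categories=category_order, ordered=True)
counts = (df_long.groupby(["aspek", "sentimen"], observed=False)
          .size().reset_index(name="jumlah"))
print(counts)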