zdannn2808 committed
Commit aff4068 · verified · 1 parent: 77a2742

add some comments

Files changed (2):
  1. app.py +87 -38
  2. visualization.py +22 -4
app.py CHANGED
@@ -34,11 +34,12 @@ from visualization import (
 )
 from preprocessing import text_preprocessing_pipeline
 
-# Configuration for chunked processing
+# Configuration for chunked processing (splits large data into smaller parts)
 CHUNK_SIZE = 2500
 ENABLE_CHUNKED = True
 CACHE_EXPIRY_HOURS = 24
 
+# Create directories for storing cache files
 os.makedirs("chache_file", exist_ok=True)
 os.makedirs("chache_file/sessions", exist_ok=True)
 
@@ -56,27 +57,30 @@ st.markdown('<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.3/fon
 
 
 def get_session_id():
-    """Generate or retrieve a session ID for the user - PERSISTENT across refresh"""
+    """Generate or fetch the user's session ID - persists even across page refreshes"""
     query_params = st.query_params
 
+    # Check whether a session ID already exists in the URL parameters
     if "sid" in query_params:
        sid = query_params["sid"]
        st.session_state.session_id = sid
        return sid
 
+    # If none exists yet, create a new session ID
    if "session_id" not in st.session_state:
        new_session_id = str(uuid.uuid4())
        st.session_state.session_id = new_session_id
        st.query_params["sid"] = new_session_id
        return new_session_id
 
+    # If it already exists in session state, reuse the existing one
    existing_id = st.session_state.session_id
    st.query_params["sid"] = existing_id
    return existing_id
 
 
 def get_session_cache_dir():
-    """Get the cache directory for this session"""
+    """Get the cache directory dedicated to this session"""
    sid = get_session_id()
    cache_dir = Path(f"chache_file/sessions/{sid}")
    cache_dir.mkdir(parents=True, exist_ok=True)
@@ -84,14 +88,14 @@ def get_session_cache_dir():
 
 
 def get_session_chunks_dir():
-    """Get the chunks directory for this session"""
+    """Get the chunks directory dedicated to this session"""
    chunks_dir = get_session_cache_dir() / "chunks"
    chunks_dir.mkdir(parents=True, exist_ok=True)
    return chunks_dir
 
 
 def cleanup_old_sessions():
-    """Delete session caches that have expired (> 24 hours)"""
+    """Delete session caches that have expired (older than 24 hours)"""
    sessions_dir = Path("chache_file/sessions")
    if not sessions_dir.exists():
        return
@@ -102,6 +106,7 @@ def cleanup_old_sessions():
        mod_time = session_dir.stat().st_mtime
        age_hours = (current_time - mod_time) / 3600
 
+        # Delete if older than CACHE_EXPIRY_HOURS
        if age_hours > CACHE_EXPIRY_HOURS:
            try:
                shutil.rmtree(session_dir)
@@ -110,18 +115,21 @@ def cleanup_old_sessions():
                print(f"Error deleting session {session_dir.name}: {e}")
 
 
+# Run cleanup when the app starts
 cleanup_old_sessions()
 
 
 @st.cache_resource(show_spinner=False)
 def get_model_resources():
-    """Load the IndoBERT model and tokenizer."""
+    """Load the IndoBERT model and tokenizer (cached so they are not reloaded repeatedly)"""
    return load_model_and_tokenizer()
 
 
+# Load the model and tokenizer behind a spinner
 with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
    model, tokenizer, le, device = get_model_resources()
 
+# Show a temporary success notification
 success_placeholder = st.empty()
 success_placeholder.success("Model dan tokenizer berhasil dimuat!")
 time.sleep(1)
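The session-handling hunks above pin each user's cache directory to an ID that survives page refreshes by mirroring it into the URL query string (sid), since st.session_state alone is cleared on refresh. A minimal standalone sketch of the same pattern, assuming a recent Streamlit with the st.query_params API (the function name is illustrative, not the app's):

import uuid

import streamlit as st

def persistent_session_id() -> str:
    # Prefer the ID already in the URL; otherwise reuse session state;
    # otherwise mint a new one. Writing it back into the URL lets a
    # refreshed page (with empty session state) recover the same ID.
    if "sid" in st.query_params:
        st.session_state.session_id = st.query_params["sid"]
    elif "session_id" not in st.session_state:
        st.session_state.session_id = str(uuid.uuid4())
    st.query_params["sid"] = st.session_state.session_id
    return st.session_state.session_id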
@@ -129,7 +137,7 @@ success_placeholder.empty()
 
 
 def convert_df_to_excel(df):
-    """Convert a DataFrame into an Excel file as a byte stream."""
+    """Convert a DataFrame into an Excel byte stream for download"""
    output = BytesIO()
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False)
@@ -137,7 +145,7 @@ def convert_df_to_excel(df):
 
 
 def clear_memory():
-    """Clear memory cache"""
+    """Clear memory caches to optimize performance"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
@@ -146,9 +154,9 @@ def clear_memory():
 def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
    """
    Process one chunk of data with batch processing.
-    Progress bar: Preprocessing 0-100%, then Predicting 0-100%
+    The progress bar shows: Preprocessing 0-100%, then Predicting 0-100%
    """
-    # STEP 1: Preprocessing (0-100%)
+    # STEP 1: Text preprocessing (0-100%)
    cleaned_text_list = []
    total_rows = len(chunk_dataframe)
 
@@ -156,6 +164,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
        clean_text = text_preprocessing_pipeline(str(raw_text))
        cleaned_text_list.append(clean_text)
 
+        # Update the progress bar every 50 rows
        if idx % 50 == 0 or idx == total_rows - 1:
            progress = (idx + 1) / total_rows
            progress_bar.progress(progress)
@@ -168,11 +177,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
            f"Chunk {chunk_num}/{total_chunk_count} | Memulai prediksi...")
    time.sleep(0.2)
 
-    # STEP 2: Batch Prediction (0-100%)
+    # STEP 2: Batch prediction with the model (0-100%)
    batch_sz = CONFIG.get("batch_size", 32)
    num_sents = len(cleaned_text_list)
    num_asps = len(ASPEK_COLUMNS)
 
+    # Prepare the dataset and dataloader
    ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
                     tokenizer, CONFIG["max_len"])
    dl = DataLoader(
@@ -182,11 +192,13 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
        num_workers=0
    )
 
+    # Matrix for storing the prediction results
    predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
 
    batch_counter = 0
    total_batch_count = len(dl)
 
+    # Run prediction batch by batch
    model.eval()
    with torch.no_grad():
        for batch_data in dl:
@@ -195,22 +207,25 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
            sent_idxs = batch_data['sent_idx'].numpy()
            asp_idxs = batch_data['aspect_idx'].numpy()
 
+            # Predict and convert to labels
            model_outputs = model(inp_ids, attn_mask)
            probabilities = F.softmax(model_outputs, dim=1)
            predicted_indices = torch.argmax(
                probabilities, dim=1).cpu().numpy()
            pred_labels = le.inverse_transform(predicted_indices)
 
+            # Store the predictions in the matrix
            for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
                predictions_matrix[s_idx][a_idx] = lbl
 
+            # Update the progress bar
            batch_counter += 1
            progress = batch_counter / total_batch_count
            progress_bar.progress(progress)
            status_text.text(
                f"Chunk {chunk_num}/{total_chunk_count} | Predicting: {batch_counter}/{total_batch_count} batches")
 
-    # STEP 3: Combine results
+    # STEP 3: Merge the predictions back into the original data
    result_list = []
    for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
        row_dict = data_row.to_dict()
@@ -221,11 +236,12 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
 
    result_dataframe = pd.DataFrame(result_list)
 
+    # Save the chunk result to a CSV file
    chunks_directory = get_session_chunks_dir()
    chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
    result_dataframe.to_csv(chunk_filepath, index=False)
 
-    # Complete progress
+    # Progress complete
    progress_bar.progress(1.0)
    status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
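process_chunk_batch persists every finished chunk as chunk_{n}.csv, and the upload handler further down skips chunks whose file already exists, so an interrupted run resumes where it stopped. The same resume loop condensed into a framework-free sketch (the process callback is a hypothetical stand-in for the real preprocessing plus prediction):

from pathlib import Path

import pandas as pd

CHUNK_SIZE = 2500  # same constant the app defines

def run_chunked(df: pd.DataFrame, chunks_dir: Path, process) -> pd.DataFrame:
    # Work through df in CHUNK_SIZE slices; each completed slice is cached
    # as a CSV, so a re-run only computes the chunks that are still missing.
    chunks_dir.mkdir(parents=True, exist_ok=True)
    results = []
    for num, start in enumerate(range(0, len(df), CHUNK_SIZE), start=1):
        cache = chunks_dir / f"chunk_{num}.csv"
        if cache.exists():  # already processed: reuse the cached result
            results.append(pd.read_csv(cache))
            continue
        out = process(df.iloc[start:start + CHUNK_SIZE].copy())
        out.to_csv(cache, index=False)  # persist before moving on
        results.append(out)
    return pd.concat(results, ignore_index=True)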
 
@@ -235,7 +251,7 @@ def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_
 
 
 def get_available_columns(df):
-    """Detect which columns are available in the dataframe"""
+    """Detect which dataframe columns are available for dynamic filters and visualizations"""
    available = {
        'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
        'has_semester': 'semester' in df.columns,
@@ -256,7 +272,7 @@ st.markdown(" ")
 st.markdown(" ")
 st.markdown(" ")
 
-# Usage guide
+# Application usage guide
 steps = [
    {"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
     "description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
@@ -283,18 +299,19 @@ for i, step in enumerate(steps):
 st.markdown("")
 st.markdown("")
 
-# Upload file
+# Excel file upload
 uploaded_file = st.file_uploader(
    " Upload Data Kritik & Saran",
    type=["xlsx"],
    help="File maksimal 200MB dengan format .xlsx"
 )
 
-# Clear cache buttons - SESSION SPECIFIC
+# Cache-clearing buttons - SESSION SPECIFIC
 session_cache_dir = get_session_cache_dir()
 session_result_file = session_cache_dir / "temp_predicted.csv"
 session_chunks_dir = get_session_chunks_dir()
 
+# Button to clear the cached prediction results
 if session_result_file.exists():
    if st.button("Hapus Cache Data"):
        session_result_file.unlink()
@@ -302,6 +319,7 @@ if session_result_file.exists():
        time.sleep(1)
        st.rerun()
 
+# Button to clear the cached chunks
 if session_chunks_dir.exists():
    chunk_files = list(session_chunks_dir.glob("*.csv"))
    if chunk_files:
@@ -313,6 +331,7 @@ if session_chunks_dir.exists():
            time.sleep(1)
            st.rerun()
 
+# Show info about the cached file, if any
 if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
    if not uploaded_file:
        metadata_file = session_cache_dir / "metadata.txt"
@@ -334,9 +353,11 @@ if session_result_file.exists() or (session_chunks_dir.exists() and list(session
        st.caption(" ")
 
 
+# Initialize session state for storing the prediction results
 if "df_predicted" not in st.session_state:
    st.session_state.df_predicted = None
 
+# Load from cache if available
 if st.session_state.df_predicted is None and session_result_file.exists():
    try:
        df_cached = pd.read_csv(session_result_file)
@@ -349,14 +370,17 @@ if st.session_state.df_predicted is None and session_result_file.exists():
        st.warning(f"Gagal memuat cache: {e}")
 
 
+# Process the uploaded file
 if uploaded_file:
    file_bytes = uploaded_file.getvalue()
+    # Check whether this is a new file or the same one as before
    if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
        st.session_state.last_uploaded_file = file_bytes
        st.session_state.uploaded_filename = uploaded_file.name
        try:
            df_uploaded = pd.read_excel(BytesIO(file_bytes))
 
+            # Convert the year column if present
            if "tahun" in df_uploaded.columns:
                df_uploaded["tahun"] = pd.to_numeric(
                    df_uploaded["tahun"], errors='coerce').astype('Int64')
@@ -364,11 +388,14 @@ if uploaded_file:
        except ValueError as err:
            st.error(f"Gagal membaca file: {err}")
        else:
+            # Validate that the required kritik_saran column exists
            if "kritik_saran" not in df_uploaded.columns:
                st.error("Kolom 'kritik_saran' tidak ditemukan.")
            else:
+                # Drop duplicates based on the kritik_saran column
                df_uploaded = df_uploaded.drop_duplicates(
                    subset=["kritik_saran"])
+                # Add aspect columns if they do not exist yet
                for aspect_col in ASPEK_COLUMNS:
                    if aspect_col not in df_uploaded.columns:
                        df_uploaded[aspect_col] = None
@@ -376,9 +403,11 @@ if uploaded_file:
                st.markdown("### Preprocessing dan Prediksi")
 
                total_rows = len(df_uploaded)
+                # Decide whether to use chunked processing
                use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
 
                if use_chunked:
+                    # CHUNKED PROCESSING MODE for large datasets
                    num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
 
                    info_col1, info_col2, info_col3 = st.columns(3)
@@ -397,6 +426,7 @@ if uploaded_file:
                    chunk_status_text = st.empty()
                    overall_status = st.empty()
 
+                    # Process each chunk
                    for start_idx in range(0, total_rows, CHUNK_SIZE):
                        current_chunk_number = (start_idx // CHUNK_SIZE) + 1
                        current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
@@ -405,6 +435,7 @@ if uploaded_file:
                        current_chunk_file = session_chunks_dir / \
                            f"chunk_{current_chunk_number}.csv"
 
+                        # Check whether the chunk was already processed (present in the cache)
                        if current_chunk_file.exists():
                            chunk_result = pd.read_csv(current_chunk_file)
                            all_chunk_results.append(chunk_result)
@@ -423,6 +454,7 @@ if uploaded_file:
                            time.sleep(0.3)
                            continue
 
+                        # Process a new chunk
                        chunk_progress_bar.progress(0)
 
                        chunk_result = process_chunk_batch(
@@ -431,6 +463,7 @@ if uploaded_file:
                        )
                        all_chunk_results.append(chunk_result)
 
+                        # Estimate the remaining time
                        processed = min(start_idx + CHUNK_SIZE, total_rows)
                        progress_pct = (processed / total_rows) * 100
                        elapsed = time.time() - start_time
@@ -445,6 +478,7 @@ if uploaded_file:
 
                        time.sleep(0.3)
 
+                    # Merge all chunk results
                    chunk_status_text.empty()
                    overall_status.info("🔄 Menggabungkan semua chunks...")
                    df_session = pd.concat(
@@ -455,6 +489,7 @@ if uploaded_file:
                    duration = end_time - start_time
 
                else:
+                    # BATCH PROCESSING MODE for small datasets
                    st.info(
                        f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
 
@@ -463,6 +498,7 @@ if uploaded_file:
                    progress_bar = st.progress(0)
                    status_text = st.empty()
 
+                    # STEP 1: Preprocessing
                    cleaned_text_list = []
                    total_preprocessing = len(df_uploaded)
 
@@ -476,6 +512,7 @@ if uploaded_file:
                            status_text.text(
                                f"Preprocessing: {idx+1}/{total_preprocessing} rows")
 
+                    # STEP 2: Prediction
                    progress_bar.progress(0)
                    status_text.text("Memulai prediksi...")
                    time.sleep(0.3)
@@ -519,6 +556,7 @@ if uploaded_file:
                            status_text.text(
                                f"Predicting: {batch_counter}/{total_batch_count} batches")
 
+                    # STEP 3: Merge the results
                    result_list = []
                    for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
                        row_dict = data_row.to_dict()
@@ -538,16 +576,20 @@ if uploaded_file:
                end_time = time.time()
                duration = end_time - start_time
 
+                # Save the results to session state and the cache file
                st.session_state.df_predicted = df_session
                df_session.to_csv(session_result_file, index=False)
 
+                # Save the filename metadata
                metadata_file = session_cache_dir / "metadata.txt"
                with open(metadata_file, "w", encoding="utf-8") as f:
                    f.write(uploaded_file.name)
 
+                # Compute the processing performance
                total_items = total_rows * len(ASPEK_COLUMNS)
                items_per_second = total_items / duration if duration > 0 else 0
 
+                # Show a summary of the processing results
                if use_chunked:
                    st.success(
                        f"✅ **Chunked + Batch Processing selesai!**\n\n"
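The chunk loop above reports overall progress from elapsed wall-clock time. The hunk elides the exact remaining-time arithmetic, but a standard linear extrapolation over the processed row count looks like this (a sketch under that assumption, not necessarily the app's exact formula):

import time

def eta_seconds(start_time: float, processed: int, total: int) -> float:
    # Linear estimate: remaining rows divided by the observed rate so far.
    elapsed = time.time() - start_time
    if processed == 0 or elapsed == 0:
        return float("inf")
    rate = processed / elapsed  # rows per second so far
    return (total - processed) / rate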
@@ -567,11 +609,11 @@ if uploaded_file:
                        f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
                    )
 
-# After prediction finishes
+# Display the prediction results and visualizations
 if st.session_state.df_predicted is not None:
    df_predicted = st.session_state.df_predicted
 
-    # Detect the available columns
+    # Detect the columns available for dynamic filters
    available_cols = get_available_columns(df_predicted)
 
    # Sidebar filters with dynamic column checks
@@ -586,7 +628,7 @@ if st.session_state.df_predicted is not None:
        st.sidebar.info(
            "Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
 
-    # Mata Kuliah filter
+    # Mata Kuliah filter (if the column is available)
    selected_matkul = []
    if available_cols['has_matkul']:
        matkul_options = sorted(
@@ -595,7 +637,7 @@ if st.session_state.df_predicted is not None:
        selected_matkul = st.sidebar.multiselect(
            "Nama Mata Kuliah", matkul_options, default=matkul_options)
 
-    # Program Studi filter
+    # Program Studi filter (if the column is available)
    selected_prodi = []
    if available_cols['has_prodi']:
        prodi_options = sorted(
@@ -604,7 +646,7 @@ if st.session_state.df_predicted is not None:
        selected_prodi = st.sidebar.multiselect(
            "Program Studi", prodi_options, default=prodi_options)
 
-    # Tahun filter
+    # Tahun filter (if the column is available)
    selected_tahun = []
    if available_cols['has_tahun']:
        if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
@@ -617,7 +659,7 @@ if st.session_state.df_predicted is not None:
        selected_tahun = st.sidebar.multiselect(
            "Tahun", tahun_options, default=tahun_options)
 
-    # Semester filter
+    # Semester filter (if the column is available)
    selected_semester = []
    if available_cols['has_semester']:
        semester_options = sorted(
@@ -626,7 +668,7 @@ if st.session_state.df_predicted is not None:
        selected_semester = st.sidebar.multiselect(
            "Semester", semester_options, default=semester_options)
 
-    # Apply filters
+    # Apply all selected filters
    df_filtered = df_clean.copy()
 
    if selected_matkul and available_cols['has_matkul']:
@@ -648,7 +690,7 @@ if st.session_state.df_predicted is not None:
    st.markdown("### Tabel Data Hasil Prediksi")
    st.dataframe(df_filtered, width='stretch')
 
-    # Download buttons
+    # Download buttons for the filtered data and all data
    col_dl1, col_dl2 = st.columns(2)
    with col_dl1:
        st.download_button(
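Both download buttons serve the table through convert_df_to_excel from earlier in this diff, which renders the DataFrame into an in-memory Excel file. Its essence, with an illustrative hookup (the button label and file name here are placeholders, not the app's exact strings):

from io import BytesIO

import pandas as pd
import streamlit as st

def df_to_excel_bytes(df: pd.DataFrame) -> bytes:
    # Write into an in-memory buffer so st.download_button can serve the
    # bytes without touching disk (requires the openpyxl engine).
    buf = BytesIO()
    with pd.ExcelWriter(buf, engine="openpyxl") as writer:
        df.to_excel(writer, index=False)
    return buf.getvalue()

# st.download_button("Download Excel", data=df_to_excel_bytes(df_filtered),
#                    file_name="hasil_prediksi.xlsx")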
@@ -677,17 +719,18 @@ if st.session_state.df_predicted is not None:
    st.markdown("### Ringkasan Cepat")
    st.markdown("")
 
+    # Count the sentiment totals across all aspects
    total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
    total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
    total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
 
-    # Count the columns available for the summary
+    # Determine the summary columns based on the available data
    summary_cols = []
 
    # Base columns (always present)
    summary_cols.extend(['ulasan', 'aspek'])
 
-    # Optional columns
+    # Optional columns, depending on data availability
    if available_cols['has_matkul']:
        summary_cols.append('matkul')
    if available_cols['has_prodi']:
@@ -695,31 +738,31 @@ if st.session_state.df_predicted is not None:
    if available_cols['has_semester']:
        summary_cols.append('semester')
 
-    # Create dynamic columns based on the available data
+    # Create dynamic columns to display the metrics
    num_cols = len(summary_cols)
    cols = st.columns(num_cols)
 
    col_idx = 0
 
-    # Reviews & aspects (always present)
+    # Base metrics: number of reviews & aspects
    cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
    col_idx += 1
    cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
    col_idx += 1
 
-    # Mata Kuliah (if present)
+    # Mata Kuliah metric (if available)
    if available_cols['has_matkul']:
        matkul_count = df_filtered['nama_matakuliah'].nunique()
        cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
        col_idx += 1
 
-    # Prodi (if present)
+    # Prodi metric (if available)
    if available_cols['has_prodi']:
        prodi_count = df_filtered['nama_prodi'].nunique()
        cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
        col_idx += 1
 
-    # Semester (if present)
+    # Semester metric (if available)
    if available_cols['has_semester']:
        semester_count = df_filtered['semester'].nunique()
        cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
@@ -727,7 +770,7 @@ if st.session_state.df_predicted is not None:
 
    st.markdown("")
 
-    # Second row: sentiment + extra info
+    # Second row: sentiment metrics and extra info
    summary_cols2 = ['positif', 'netral', 'negatif']
 
    if available_cols['has_tahun']:
@@ -738,6 +781,7 @@ if st.session_state.df_predicted is not None:
    cols2 = st.columns(len(summary_cols2))
 
    col_idx2 = 0
+    # One metric per sentiment class
    cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
    col_idx2 += 1
    cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
@@ -745,7 +789,7 @@ if st.session_state.df_predicted is not None:
    cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
    col_idx2 += 1
 
-    # Year range (if present)
+    # Year-range metric (if available)
    if available_cols['has_tahun']:
        if 'tahun' in df_filtered.columns:
            tahun_valid = df_filtered['tahun'].dropna()
@@ -763,7 +807,7 @@ if st.session_state.df_predicted is not None:
            cols2[col_idx2].metric("Rentang Tahun", "N/A")
            col_idx2 += 1
 
-    # Average word count (if present)
+    # Average word-count metric (if available)
    if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
        try:
            word_counts = df_filtered['kritik_saran'].astype(
@@ -784,9 +828,10 @@ if st.session_state.df_predicted is not None:
    with col2:
        show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
 
-    # Visualizations based on the available columns
+    # Distribution visualizations based on the available columns
    viz_shown = False
 
+    # Year and semester visualizations (if available)
    if available_cols['has_tahun'] or available_cols['has_semester']:
        col1, col2 = st.columns(2)
        with col1:
@@ -800,19 +845,21 @@ if st.session_state.df_predicted is not None:
            if result:
                viz_shown = True
 
+    # Program Studi visualization (if available)
    if available_cols['has_prodi']:
        st.markdown("---")
        result = show_prodi_distribution(df_filtered)
        if result:
            viz_shown = True
 
+    # Top-10 Mata Kuliah visualization (if available)
    if available_cols['has_matkul']:
        st.markdown("---")
        result = show_top10_matkul_distribution(df_filtered)
        if result:
            viz_shown = True
 
-    # Sentiment per year/semester
+    # Sentiment per year/semester visualization (if available)
    if available_cols['has_tahun'] or available_cols['has_semester']:
        st.markdown("---")
        col1, col2 = st.columns(2)
@@ -827,19 +874,21 @@ if st.session_state.df_predicted is not None:
            if result:
                viz_shown = True
 
+    # Sentiment per Program Studi visualization (if available)
    if available_cols['has_prodi']:
        st.markdown("---")
        result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
        if result:
            viz_shown = True
 
+    # Sentiment per top-10 Mata Kuliah visualization (if available)
    if available_cols['has_matkul']:
        st.markdown("---")
        result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
        if result:
            viz_shown = True
 
-    # Footer
+    # App footer
    st.caption("""
    <div class='footer'>
        © 2025 Darmawan Jiddan | Dibuat dengan ❤️ menggunakan Streamlit
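The quick-summary hunks size their metric rows dynamically: they first collect which summary slots the data supports, then create exactly that many st.columns and fill them one by one. Reduced to a reusable sketch (the metrics dict and its labels are hypothetical):

import streamlit as st

def render_metrics(metrics: dict) -> None:
    # One st.metric per available entry; the column count adapts to the data.
    cols = st.columns(len(metrics))
    for col, (label, value) in zip(cols, metrics.items()):
        col.metric(label, value)

# e.g. render_metrics({"Jumlah Ulasan": "1,234", "Jumlah Aspek": 5})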
 
visualization.py CHANGED
@@ -13,12 +13,13 @@ import plotly.express as px
 from config import ASPEK_COLUMNS
 
 
-# Custom color palette
+# Color definitions for each sentiment category
 sentimen_palette = {
    "netral": "#FFE24C",
    "positif": "#4CFF72",
    "negatif": "#FF4C4C"
 }
+# Category order for a consistent display across all charts
 category_order = ["netral", "positif", "negatif"]
 
 # Plotly configuration
@@ -30,20 +31,24 @@ config_options = {
 
 def show_sentiment_bar_chart(df_predicted, aspek_columns):
    """Show a bar chart of the sentiment distribution per aspect."""
+    # Validate the data and the required columns
    if df_predicted.empty or not set(aspek_columns).issubset(df_predicted.columns):
        st.warning("Data atau kolom aspek tidak tersedia untuk ditampilkan.")
        return
 
+    # Transform from wide to long format for visualization
    df_long = df_predicted.melt(
        value_vars=aspek_columns,
        var_name="aspek",
        value_name="sentimen"
    )
+    # Convert to categorical to guarantee a consistent order
    df_long["sentimen"] = pd.Categorical(
        df_long["sentimen"],
        categories=category_order,
        ordered=True
    )
+    # Aggregate to count the rows per aspect and sentiment
    count_data = df_long.groupby(
        ["aspek", "sentimen"], observed=False
    ).size().reset_index(name="jumlah")
@@ -62,10 +67,12 @@ def show_sentiment_bar_chart(df_predicted, aspek_columns):
 
 def show_sentiment_pie_chart(df_predicted, aspek_columns):
    """Show a pie chart of the overall sentiment distribution."""
+    # Flatten the sentiment values of all aspects into a 1D array
    sentimen_total = df_predicted[aspek_columns].values.ravel()
    sentimen_counts = pd.Series(sentimen_total).value_counts().reset_index()
    sentimen_counts.columns = ["sentimen", "jumlah"]
    sentimen_counts = sentimen_counts.sort_values("jumlah", ascending=False)
+    # Donut chart via the hole parameter
    fig = px.pie(sentimen_counts, names="sentimen", values="jumlah",
                 color="sentimen", color_discrete_map=sentimen_palette,
                 hole=0.3)
@@ -76,12 +83,13 @@ def show_sentiment_pie_chart(df_predicted, aspek_columns):
 
 def show_year_distribution(df):
    """Show the distribution of feedback counts per year."""
-    # Try to extract the year from the tanggal column if present
+    # Extract the year from the tanggal column if no tahun column is available
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
 
+    # Return None if there is no year data (handled by the caller)
    if 'tahun' not in df.columns:
-        return None  # No tahun column available
+        return None
 
    df_tahun = df.dropna(subset=['tahun']).copy()
    if df_tahun.empty:
@@ -121,6 +129,7 @@ def show_prodi_distribution(df):
 
    prodi_counts = df['nama_prodi'].value_counts().reset_index()
    prodi_counts.columns = ['nama_prodi', 'jumlah']
+    # Sort ascending for the horizontal bar (small values at the bottom)
    prodi_counts = prodi_counts.sort_values(by='jumlah', ascending=True)
    fig = px.bar(
        prodi_counts,
@@ -142,6 +151,7 @@ def show_top10_matkul_distribution(df):
    if missing_cols:
        return None
 
+    # Group to count the frequency per course
    matkul_counts = (
        df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
        .size()
@@ -149,6 +159,7 @@ def show_top10_matkul_distribution(df):
        .sort_values(by='jumlah', ascending=False)
        .head(10)
    )
+    # Combine code and name into an informative label
    matkul_counts['label'] = (
        matkul_counts['kode_matakuliah'] + " - " +
        matkul_counts['nama_matakuliah']
@@ -169,13 +180,14 @@ def show_top10_matkul_distribution(df):
 
 def show_sentiment_by_year(df, aspek_columns):
    """Show the sentiment distribution per year."""
-    # Try to extract the year from the tanggal column if present
+    # Extract the year from the tanggal column if needed
    if 'tanggal' in df.columns and 'tahun' not in df.columns:
        df['tahun'] = pd.to_datetime(df['tanggal'], errors='coerce').dt.year
 
    if 'tahun' not in df.columns:
        return None
 
+    # Transform to long format with tahun as the id_vars
    df_long = df.melt(id_vars=['tahun'],
                      value_vars=aspek_columns,
                      var_name='aspek',
@@ -230,13 +242,16 @@ def show_sentiment_by_prodi(df, aspek_columns):
        .reset_index(name='jumlah')
    )
 
+    # Compute the total per prodi to sort from most to least
    total_per_prodi = (
        prodi_sentiment.groupby('nama_prodi')['jumlah']
        .sum()
        .sort_values(ascending=False)
    )
+    # Reverse the order for the horizontal bar (largest values on top)
    ordered_categories = total_per_prodi.index.tolist()[::-1]
 
+    # Convert to categorical to control the display order
    prodi_sentiment['nama_prodi'] = pd.Categorical(
        prodi_sentiment['nama_prodi'],
        categories=ordered_categories,
@@ -269,6 +284,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
    if missing_cols:
        return None
 
+    # Keep the top 10 courses by frequency
    df_top10 = (
        df.groupby(['kode_matakuliah', 'nama_matakuliah'], observed=False)
        .size()
@@ -287,6 +303,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
        value_name='sentimen'
    )
 
+    # Combine code and name into a label
    df_long['label'] = (
        df_long['kode_matakuliah'] + " - " + df_long['nama_matakuliah']
    )
@@ -297,6 +314,7 @@ def show_sentiment_by_top10_matkul(df, aspek_columns):
        .reset_index(name='jumlah')
    )
 
+    # Sort by the total sentiment per course
    total_per_label = (
        matkul_sentiment.groupby('label')['jumlah']
        .sum()
 
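Most chart helpers in this file share the reshaping idiom the new comments describe: melt the per-aspect sentiment columns from wide to long, pin the category order with a pandas Categorical, then count. A self-contained sketch with toy data (the aspect column names are made up; aspek, sentimen, and jumlah follow the file's conventions):

import pandas as pd

category_order = ["netral", "positif", "negatif"]

df = pd.DataFrame({
    "fasilitas": ["positif", "negatif", "positif"],
    "pengajaran": ["netral", "positif", "negatif"],
})

# Wide -> long: one row per (aspect, sentiment) observation.
df_long = df.melt(var_name="aspek", value_name="sentimen")
# A Categorical fixes the legend/axis order across every chart.
df_long["sentimen"] = pd.Categorical(
    df_long["sentimen"], categories=category_order, ordered=True)
counts = (df_long.groupby(["aspek", "sentimen"], observed=False)
          .size().reset_index(name="jumlah"))
print(counts)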