"""
app.py
------
Aplikasi berbasis web ABSA IndoBERT menggunakan Streamlit untuk analisis sentimen
berbasis aspek dari kritik dan saran mahasiswa.
UPDATED: Dengan Batch + Chunked Processing + Session-based Cache untuk multi-user
UPDATED: Visualisasi dinamis yang menyesuaikan dengan kolom yang tersedia
"""
import os
import time
import gc
import uuid
import shutil
from io import BytesIO
from pathlib import Path
import pandas as pd
import streamlit as st
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from config import CONFIG, ASPEK_COLUMNS
from model_utils import load_model_and_tokenizer, ABSADataset
from visualization import (
show_sentiment_bar_chart,
show_sentiment_pie_chart,
show_year_distribution,
show_semester_distribution,
show_prodi_distribution,
show_top10_matkul_distribution,
show_sentiment_by_year,
show_sentiment_by_semester,
show_sentiment_by_prodi,
show_sentiment_by_top10_matkul,
)
from preprocessing import text_preprocessing_pipeline
# Configuration for chunked processing (splits large datasets into smaller parts)
CHUNK_SIZE = 2500
ENABLE_CHUNKED = True
CACHE_EXPIRY_HOURS = 24
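# Example: with CHUNK_SIZE = 2500, a 6,000-row upload is split into 3 chunks
# (2,500 + 2,500 + 1,000 rows); uploads of 2,500 rows or fewer are not chunked.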
# Create directories for storing cache files
os.makedirs("cache_file", exist_ok=True)
os.makedirs("cache_file/sessions", exist_ok=True)
# Page configuration
st.set_page_config(
page_title="ABSA IndoBERT",
layout="wide",
page_icon="💬"
)
# Load custom CSS
with open(os.path.join("assets", "style.css"), encoding="utf-8") as f:
    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
# Bootstrap Icons stylesheet (assumed CDN URL) for the "bi bi-*" icons used in the step cards
st.markdown(
    '<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons/font/bootstrap-icons.css">',
    unsafe_allow_html=True,
)
def get_session_id():
"""Generate atau ambil session ID untuk user - tetap ada meski refresh halaman"""
query_params = st.query_params
    # Check whether a session ID already exists in the URL query parameters
if "sid" in query_params:
sid = query_params["sid"]
st.session_state.session_id = sid
return sid
    # If none exists yet, create a new session ID
if "session_id" not in st.session_state:
new_session_id = str(uuid.uuid4())
st.session_state.session_id = new_session_id
st.query_params["sid"] = new_session_id
return new_session_id
    # If it already exists in session state, reuse the existing one
existing_id = st.session_state.session_id
st.query_params["sid"] = existing_id
return existing_id
def get_session_cache_dir():
"""Dapatkan direktori cache khusus untuk session ini"""
sid = get_session_id()
cache_dir = Path(f"chache_file/sessions/{sid}")
cache_dir.mkdir(parents=True, exist_ok=True)
return cache_dir
def get_session_chunks_dir():
"""Dapatkan direktori chunks khusus untuk session ini"""
chunks_dir = get_session_cache_dir() / "chunks"
chunks_dir.mkdir(parents=True, exist_ok=True)
return chunks_dir
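# Per-session cache layout on disk:
#   cache_file/sessions/<session-id>/temp_predicted.csv    - merged prediction results
#   cache_file/sessions/<session-id>/metadata.txt           - original upload filename
#   cache_file/sessions/<session-id>/chunks/chunk_<n>.csv   - cached per-chunk results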
def cleanup_old_sessions():
"""Hapus cache session yang sudah expired (lebih dari 24 jam)"""
sessions_dir = Path("chache_file/sessions")
if not sessions_dir.exists():
return
current_time = time.time()
for session_dir in sessions_dir.iterdir():
if session_dir.is_dir():
mod_time = session_dir.stat().st_mtime
age_hours = (current_time - mod_time) / 3600
            # Delete if older than CACHE_EXPIRY_HOURS
if age_hours > CACHE_EXPIRY_HOURS:
try:
shutil.rmtree(session_dir)
print(f"Deleted expired session: {session_dir.name}")
except OSError as e:
print(f"Error deleting session {session_dir.name}: {e}")
# Run cleanup when the application starts
cleanup_old_sessions()
@st.cache_resource(show_spinner=False)
def get_model_resources():
"""Memuat model dan tokenizer IndoBERT (di-cache agar tidak reload terus)"""
return load_model_and_tokenizer()
# Load the model and tokenizer with a spinner
with st.spinner("Sedang memuat model IndoBERT dan tokenizer... Harap tunggu sebentar!"):
model, tokenizer, le, device = get_model_resources()
# Show a temporary success notification
success_placeholder = st.empty()
success_placeholder.success("Model dan tokenizer berhasil dimuat!")
time.sleep(1)
success_placeholder.empty()
def convert_df_to_excel(df):
"""Mengubah DataFrame menjadi file Excel dalam bentuk byte stream untuk download"""
output = BytesIO()
with pd.ExcelWriter(output, engine="openpyxl") as writer:
df.to_excel(writer, index=False)
return output.getvalue()
def clear_memory():
"""Bersihkan memory cache untuk optimasi performa"""
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def process_chunk_batch(chunk_dataframe, chunk_num, total_chunk_count, progress_bar, status_text):
"""
Memproses satu chunk data dengan batch processing.
Progress bar menunjukkan: Preprocessing 0-100%, lalu Predicting 0-100%
"""
# STEP 1: Preprocessing teks (0-100%)
cleaned_text_list = []
total_rows = len(chunk_dataframe)
for idx, raw_text in enumerate(chunk_dataframe["kritik_saran"]):
clean_text = text_preprocessing_pipeline(str(raw_text))
cleaned_text_list.append(clean_text)
        # Update the progress bar every 50 rows
if idx % 50 == 0 or idx == total_rows - 1:
progress = (idx + 1) / total_rows
progress_bar.progress(progress)
status_text.text(
f"Chunk {chunk_num}/{total_chunk_count} | Preprocessing: {idx+1}/{total_rows} rows")
    # Reset the progress bar for the prediction stage
progress_bar.progress(0)
status_text.text(
f"Chunk {chunk_num}/{total_chunk_count} | Memulai prediksi...")
time.sleep(0.2)
    # STEP 2: Batch prediction with the model (0-100%)
batch_sz = CONFIG.get("batch_size", 32)
num_sents = len(cleaned_text_list)
num_asps = len(ASPEK_COLUMNS)
    # Prepare the dataset and dataloader
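    # ABSADataset yields one tokenized item per (sentence, aspect) pair (see
    # sent_idx/aspect_idx below), so the loader covers num_sents * num_asps items in total.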
ds = ABSADataset(cleaned_text_list, ASPEK_COLUMNS,
tokenizer, CONFIG["max_len"])
dl = DataLoader(
ds,
batch_size=batch_sz,
shuffle=False,
num_workers=0
)
    # Matrix for storing prediction results
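    # predictions_matrix[sentence_index][aspect_index] holds the predicted label
    # for that (sentence, aspect) pair and is filled in batch by batch below.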
predictions_matrix = [[None] * num_asps for _ in range(num_sents)]
batch_counter = 0
total_batch_count = len(dl)
    # Run prediction batch by batch
model.eval()
with torch.no_grad():
for batch_data in dl:
inp_ids = batch_data['input_ids'].to(device)
attn_mask = batch_data['attention_mask'].to(device)
sent_idxs = batch_data['sent_idx'].numpy()
asp_idxs = batch_data['aspect_idx'].numpy()
            # Predict and convert to labels
model_outputs = model(inp_ids, attn_mask)
probabilities = F.softmax(model_outputs, dim=1)
predicted_indices = torch.argmax(
probabilities, dim=1).cpu().numpy()
pred_labels = le.inverse_transform(predicted_indices)
            # Store the predictions in the matrix
for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
predictions_matrix[s_idx][a_idx] = lbl
# Update progress bar
batch_counter += 1
progress = batch_counter / total_batch_count
progress_bar.progress(progress)
status_text.text(
f"Chunk {chunk_num}/{total_chunk_count} | Predicting: {batch_counter}/{total_batch_count} batches")
    # STEP 3: Merge the predictions with the original data
result_list = []
for idx, (_, data_row) in enumerate(chunk_dataframe.iterrows()):
row_dict = data_row.to_dict()
row_dict["kritik_saran"] = cleaned_text_list[idx]
for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
row_dict[asp_name] = predictions_matrix[idx][asp_idx]
result_list.append(row_dict)
result_dataframe = pd.DataFrame(result_list)
    # Save the chunk result to a CSV file
chunks_directory = get_session_chunks_dir()
chunk_filepath = chunks_directory / f"chunk_{chunk_num}.csv"
result_dataframe.to_csv(chunk_filepath, index=False)
    # Progress complete
progress_bar.progress(1.0)
status_text.text(f"Chunk {chunk_num}/{total_chunk_count} | Selesai!")
clear_memory()
return result_dataframe
def get_available_columns(df):
"""Deteksi kolom-kolom yang tersedia dalam dataframe untuk filter dan visualisasi dinamis"""
available = {
'has_tahun': 'tahun' in df.columns or 'tanggal' in df.columns,
'has_semester': 'semester' in df.columns,
'has_prodi': 'nama_prodi' in df.columns,
'has_matkul': 'nama_matakuliah' in df.columns and 'kode_matakuliah' in df.columns,
}
return available
# Application title
st.markdown("""
ABSA IndoBERT
Analisis otomatis kritik dan saran berdasarkan aspek tertentu dan sentimen dengan model IndoBERT.
""", unsafe_allow_html=True)
st.markdown(" ")
st.markdown(" ")
st.markdown(" ")
# Application usage guide
steps = [
{"icon": "bi bi-cloud-arrow-up", "title": "1. Upload File Excel",
"description": "Siapkan dan upload file Excel kritik dan saran yang wajib memiliki kolom `kritik_saran`."},
{"icon": "bi bi-hourglass-split", "title": "2. Proses Data Otomatis",
"description": "Website akan melakukan preprocessing dan menjalankan model prediksi sentimen."},
{"icon": "bi bi-bar-chart", "title": "3. Analisis & Visualisasi",
"description": "Lihat hasil lengkap berupa tabel data terprediksi, metrik, dan visualisasi sentimen."},
{"icon": "bi bi-cloud-arrow-down", "title": "4. Unduh Hasil Analisis",
"description": "Unduh hasil analisis lengkap Anda dalam format file Excel untuk laporan lebih lanjut."}
]
cols = st.columns(len(steps))
for i, step in enumerate(steps):
with cols[i]:
st.markdown(f"""
{step['title']}
{step['description']}
""", unsafe_allow_html=True)
st.markdown("")
st.markdown("")
# Excel file upload
uploaded_file = st.file_uploader(
" Upload Data Kritik & Saran",
type=["xlsx"],
help="File maksimal 200MB dengan format .xlsx"
)
# Cache-clearing buttons - SESSION-SPECIFIC
session_cache_dir = get_session_cache_dir()
session_result_file = session_cache_dir / "temp_predicted.csv"
session_chunks_dir = get_session_chunks_dir()
# Button to delete the cached prediction results
if session_result_file.exists():
if st.button("Hapus Cache Data"):
session_result_file.unlink()
st.success("Cache Data dihapus, silahkan refresh!")
time.sleep(1)
st.rerun()
# Button to delete cached chunks
if session_chunks_dir.exists():
chunk_files = list(session_chunks_dir.glob("*.csv"))
if chunk_files:
if st.button(f"Hapus {len(chunk_files)} Chunk Cache", key="delete_chunks"):
for chunk_f in chunk_files:
chunk_f.unlink()
st.success(
f"{len(chunk_files)} Chunk cache berhasil dihapus, silahkan refresh!")
time.sleep(1)
st.rerun()
# Show info about the previously cached file, if any
if session_result_file.exists() or (session_chunks_dir.exists() and list(session_chunks_dir.glob("*.csv"))):
if not uploaded_file:
metadata_file = session_cache_dir / "metadata.txt"
cached_filename = None
if metadata_file.exists():
try:
with open(metadata_file, "r", encoding="utf-8") as f:
cached_filename = f.read().strip()
except Exception:
if "uploaded_filename" in st.session_state:
cached_filename = st.session_state.uploaded_filename
if cached_filename and cached_filename != "":
st.caption(f"File Sebelumnya: {cached_filename}")
else:
st.caption("Cache dari upload sebelumnya")
else:
st.caption(" ")
# Initialize session state for storing prediction results
if "df_predicted" not in st.session_state:
    st.session_state.df_predicted = None
# Load from cache if available
if st.session_state.df_predicted is None and session_result_file.exists():
try:
df_cached = pd.read_csv(session_result_file)
if "tahun" in df_cached.columns:
df_cached["tahun"] = pd.to_numeric(
df_cached["tahun"], errors='coerce').astype('Int64')
st.session_state.df_predicted = df_cached
st.info("Loaded from your session cache!")
except (pd.errors.EmptyDataError, FileNotFoundError) as e:
st.warning(f"Gagal memuat cache: {e}")
# Process the uploaded file
if uploaded_file:
file_bytes = uploaded_file.getvalue()
    # Check whether this is a new file or the same one as before
if "last_uploaded_file" not in st.session_state or st.session_state.last_uploaded_file != file_bytes:
st.session_state.last_uploaded_file = file_bytes
st.session_state.uploaded_filename = uploaded_file.name
try:
df_uploaded = pd.read_excel(BytesIO(file_bytes))
            # Convert the year column if present
if "tahun" in df_uploaded.columns:
df_uploaded["tahun"] = pd.to_numeric(
df_uploaded["tahun"], errors='coerce').astype('Int64')
except ValueError as err:
st.error(f"Gagal membaca file: {err}")
else:
            # Validate that the mandatory kritik_saran column exists
if "kritik_saran" not in df_uploaded.columns:
st.error("Kolom 'kritik_saran' tidak ditemukan.")
else:
                # Drop duplicates based on the kritik_saran column
df_uploaded = df_uploaded.drop_duplicates(
subset=["kritik_saran"])
                # Add aspect columns if they do not exist yet
for aspect_col in ASPEK_COLUMNS:
if aspect_col not in df_uploaded.columns:
df_uploaded[aspect_col] = None
st.markdown("### Preprocessing dan Prediksi")
total_rows = len(df_uploaded)
                # Decide whether to use chunked processing
use_chunked = ENABLE_CHUNKED and total_rows > CHUNK_SIZE
if use_chunked:
                    # CHUNKED PROCESSING MODE for large datasets
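                    # Ceiling division, e.g. 6,000 rows at 2,500 rows/chunk -> 3 chunks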
num_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
info_col1, info_col2, info_col3 = st.columns(3)
with info_col1:
st.info(f"**Total data:** {total_rows:,} rows")
with info_col2:
st.warning(
f"**Mode:** Chunked Processing ({CHUNK_SIZE:,} rows/chunk)")
with info_col3:
st.info(f"**Total chunks:** {num_chunks}")
start_time = time.time()
all_chunk_results = []
chunk_progress_bar = st.progress(0)
chunk_status_text = st.empty()
overall_status = st.empty()
                    # Process each chunk
for start_idx in range(0, total_rows, CHUNK_SIZE):
current_chunk_number = (start_idx // CHUNK_SIZE) + 1
current_chunk_df = df_uploaded.iloc[start_idx:start_idx+CHUNK_SIZE].copy(
)
current_chunk_file = session_chunks_dir / \
f"chunk_{current_chunk_number}.csv"
                        # Check whether this chunk has already been processed (exists in cache)
if current_chunk_file.exists():
chunk_result = pd.read_csv(current_chunk_file)
all_chunk_results.append(chunk_result)
processed = min(start_idx + CHUNK_SIZE, total_rows)
progress_pct = (processed / total_rows) * 100
chunk_progress_bar.progress(1.0)
chunk_status_text.text(
f"Chunk {current_chunk_number}/{num_chunks} | Loaded from cache"
)
overall_status.success(
f"✅ Chunk {current_chunk_number}/{num_chunks} loaded from cache | "
f"Progress: {processed:,}/{total_rows:,} ({progress_pct:.1f}%)"
)
time.sleep(0.3)
continue
                        # Process a new chunk
chunk_progress_bar.progress(0)
chunk_result = process_chunk_batch(
current_chunk_df, current_chunk_number, num_chunks,
chunk_progress_bar, chunk_status_text
)
all_chunk_results.append(chunk_result)
                        # Estimate the remaining time
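                        # The ETA assumes roughly constant throughput per row; chunks that
                        # were loaded from cache make the estimate optimistic.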
processed = min(start_idx + CHUNK_SIZE, total_rows)
progress_pct = (processed / total_rows) * 100
elapsed = time.time() - start_time
est_total = (elapsed / processed) * total_rows
est_remaining = est_total - elapsed
overall_status.success(
f"✅ Chunk {current_chunk_number}/{num_chunks} selesai | "
f"Progress: {processed:,}/{total_rows:,} ({progress_pct:.1f}%) | "
f"Elapsed: {elapsed:.0f}s | ETA: {est_remaining:.0f}s"
)
time.sleep(0.3)
                    # Merge all chunk results
chunk_status_text.empty()
overall_status.info("🔄 Menggabungkan semua chunks...")
df_session = pd.concat(
all_chunk_results, ignore_index=True)
overall_status.empty()
end_time = time.time()
duration = end_time - start_time
else:
                    # BATCH PROCESSING MODE for small datasets
st.info(
f"**Total data:** {total_rows:,} rows | **Mode:** Batch Processing")
start_time = time.time()
progress_bar = st.progress(0)
status_text = st.empty()
# STEP 1: Preprocessing
cleaned_text_list = []
total_preprocessing = len(df_uploaded)
for idx, raw_text in enumerate(df_uploaded["kritik_saran"]):
clean_text = text_preprocessing_pipeline(str(raw_text))
cleaned_text_list.append(clean_text)
if idx % 50 == 0 or idx == total_preprocessing - 1:
progress = (idx + 1) / total_preprocessing
progress_bar.progress(progress)
status_text.text(
f"Preprocessing: {idx+1}/{total_preprocessing} rows")
                    # STEP 2: Prediction
progress_bar.progress(0)
status_text.text("Memulai prediksi...")
time.sleep(0.3)
batch_sz = CONFIG.get("batch_size", 32)
num_sents = len(cleaned_text_list)
num_asps = len(ASPEK_COLUMNS)
ds = ABSADataset(
cleaned_text_list, ASPEK_COLUMNS, tokenizer, CONFIG["max_len"])
dl = DataLoader(
ds, batch_size=batch_sz, shuffle=False, num_workers=0)
predictions_matrix = [
[None] * num_asps for _ in range(num_sents)]
batch_counter = 0
total_batch_count = len(dl)
model.eval()
with torch.no_grad():
for batch_data in dl:
inp_ids = batch_data['input_ids'].to(device)
attn_mask = batch_data['attention_mask'].to(device)
sent_idxs = batch_data['sent_idx'].numpy()
asp_idxs = batch_data['aspect_idx'].numpy()
model_outputs = model(inp_ids, attn_mask)
probabilities = F.softmax(model_outputs, dim=1)
predicted_indices = torch.argmax(
probabilities, dim=1).cpu().numpy()
pred_labels = le.inverse_transform(
predicted_indices)
for s_idx, a_idx, lbl in zip(sent_idxs, asp_idxs, pred_labels):
predictions_matrix[s_idx][a_idx] = lbl
batch_counter += 1
progress = batch_counter / total_batch_count
progress_bar.progress(progress)
status_text.text(
f"Predicting: {batch_counter}/{total_batch_count} batches")
                    # STEP 3: Merge the results
result_list = []
for idx, (_, data_row) in enumerate(df_uploaded.iterrows()):
row_dict = data_row.to_dict()
row_dict["kritik_saran"] = cleaned_text_list[idx]
for asp_idx, asp_name in enumerate(ASPEK_COLUMNS):
row_dict[asp_name] = predictions_matrix[idx][asp_idx]
result_list.append(row_dict)
df_session = pd.DataFrame(result_list)
progress_bar.progress(1.0)
status_text.text("Selesai!")
time.sleep(0.5)
progress_bar.empty()
status_text.empty()
end_time = time.time()
duration = end_time - start_time
                # Save the results to session state and the cache file
                st.session_state.df_predicted = df_session
                df_session.to_csv(session_result_file, index=False)
                # Save the filename metadata
                metadata_file = session_cache_dir / "metadata.txt"
                with open(metadata_file, "w", encoding="utf-8") as f:
                    f.write(uploaded_file.name)
                # Compute processing performance
                total_items = total_rows * len(ASPEK_COLUMNS)
                items_per_second = total_items / duration if duration > 0 else 0
                # Show a summary of the processing results
if use_chunked:
st.success(
f"✅ **Chunked + Batch Processing selesai!**\n\n"
f"- **{total_rows:,}** ulasan diproses\n"
f"- **{len(ASPEK_COLUMNS)}** aspek per ulasan\n"
f"- **{total_items:,}** total prediksi\n"
f"- Diproses dalam **{num_chunks}** chunk\n"
f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)\n"
f"- Optimized untuk dataset besar!"
)
else:
st.success(
f"✅ **Batch Processing selesai!**\n\n"
f"- **{total_rows:,}** ulasan diproses\n"
f"- **{len(ASPEK_COLUMNS)}** aspek per ulasan\n"
f"- **{total_items:,}** total prediksi\n"
f"- Waktu: **{duration:.2f}** detik (~{items_per_second:.1f} prediksi/detik)"
)
# Display of prediction results and visualizations
if st.session_state.df_predicted is not None:
df_predicted = st.session_state.df_predicted
    # Detect the available columns for dynamic filters
    available_cols = get_available_columns(df_predicted)
    # Sidebar filters with dynamic column checks
    st.sidebar.header("Filter Data")
    df_clean = df_predicted.copy()
    # Check whether any filters are available
has_any_filter = any(available_cols.values())
if not has_any_filter:
st.sidebar.info(
"Tidak ada kolom yang dapat difilter. Pastikan file memiliki kolom seperti: nama_matakuliah, nama_prodi, tahun/tanggal, atau semester.")
    # Course filter (if the column is available)
selected_matkul = []
if available_cols['has_matkul']:
matkul_options = sorted(
[x for x in df_clean["nama_matakuliah"].dropna().unique() if x])
if matkul_options:
selected_matkul = st.sidebar.multiselect(
"Nama Mata Kuliah", matkul_options, default=matkul_options)
    # Study program filter (if the column is available)
selected_prodi = []
if available_cols['has_prodi']:
prodi_options = sorted(
[x for x in df_clean["nama_prodi"].dropna().unique() if x])
if prodi_options:
selected_prodi = st.sidebar.multiselect(
"Program Studi", prodi_options, default=prodi_options)
    # Year filter (if the column is available)
selected_tahun = []
if available_cols['has_tahun']:
if 'tanggal' in df_clean.columns and 'tahun' not in df_clean.columns:
df_clean['tahun'] = pd.to_datetime(
df_clean['tanggal'], errors='coerce').dt.year
if 'tahun' in df_clean.columns:
tahun_options = sorted(
[x for x in df_clean["tahun"].dropna().unique() if pd.notna(x)])
if tahun_options:
selected_tahun = st.sidebar.multiselect(
"Tahun", tahun_options, default=tahun_options)
    # Semester filter (if the column is available)
selected_semester = []
if available_cols['has_semester']:
semester_options = sorted(
[x for x in df_clean["semester"].dropna().unique() if pd.notna(x)])
if semester_options:
selected_semester = st.sidebar.multiselect(
"Semester", semester_options, default=semester_options)
    # Apply all selected filters
df_filtered = df_clean.copy()
if selected_matkul and available_cols['has_matkul']:
df_filtered = df_filtered[df_filtered["nama_matakuliah"].isin(
selected_matkul)]
if selected_prodi and available_cols['has_prodi']:
df_filtered = df_filtered[df_filtered["nama_prodi"].isin(
selected_prodi)]
if selected_tahun and available_cols['has_tahun']:
df_filtered = df_filtered[df_filtered["tahun"].isin(selected_tahun)]
if selected_semester and available_cols['has_semester']:
df_filtered = df_filtered[df_filtered["semester"].isin(
selected_semester)]
    # Show the prediction results table
st.markdown("### Tabel Data Hasil Prediksi")
st.dataframe(df_filtered, width='stretch')
    # Download buttons for the filtered data and for all data
col_dl1, col_dl2 = st.columns(2)
with col_dl1:
st.download_button(
label="Unduh Data Terfilter",
data=convert_df_to_excel(df_filtered),
file_name="hasil_prediksi_absa_filtered.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
use_container_width=True
)
with col_dl2:
st.download_button(
label="Unduh Semua Data",
data=convert_df_to_excel(df_predicted),
file_name="hasil_prediksi_absa_all.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
use_container_width=True
)
st.info(
f"Menampilkan {len(df_filtered):,} dari {len(df_predicted):,} data ulasan setelah difilter."
)
    # Quick summary
    st.markdown("")
    st.markdown("### Ringkasan Cepat")
    st.markdown("")
    # Count total sentiment labels across all aspects
total_pos = (df_filtered[ASPEK_COLUMNS] == "positif").sum().sum()
total_net = (df_filtered[ASPEK_COLUMNS] == "netral").sum().sum()
total_neg = (df_filtered[ASPEK_COLUMNS] == "negatif").sum().sum()
    # Determine the summary columns based on the available data
    summary_cols = []
    # Base columns (always present)
    summary_cols.extend(['ulasan', 'aspek'])
    # Optional columns based on data availability
if available_cols['has_matkul']:
summary_cols.append('matkul')
if available_cols['has_prodi']:
summary_cols.append('prodi')
if available_cols['has_semester']:
summary_cols.append('semester')
    # Create dynamic columns for displaying the metrics
    num_cols = len(summary_cols)
    cols = st.columns(num_cols)
    col_idx = 0
    # Base metrics: number of reviews & aspects
cols[col_idx].metric("Jumlah Ulasan", f"{len(df_filtered):,}")
col_idx += 1
cols[col_idx].metric("Jumlah Aspek", len(ASPEK_COLUMNS))
col_idx += 1
    # Course metric (if available)
if available_cols['has_matkul']:
matkul_count = df_filtered['nama_matakuliah'].nunique()
cols[col_idx].metric("Jumlah Mata Kuliah", f"{matkul_count:,}")
col_idx += 1
    # Study program metric (if available)
if available_cols['has_prodi']:
prodi_count = df_filtered['nama_prodi'].nunique()
cols[col_idx].metric("Jumlah Prodi", f"{prodi_count:,}")
col_idx += 1
    # Semester metric (if available)
if available_cols['has_semester']:
semester_count = df_filtered['semester'].nunique()
cols[col_idx].metric("Jumlah Semester", f"{semester_count:,}")
col_idx += 1
st.markdown("")
    # Second row: sentiment metrics and additional info
summary_cols2 = ['positif', 'netral', 'negatif']
if available_cols['has_tahun']:
summary_cols2.append('tahun')
if 'kritik_saran' in df_filtered.columns:
summary_cols2.append('kata')
cols2 = st.columns(len(summary_cols2))
col_idx2 = 0
    # Metrics for each sentiment class
cols2[col_idx2].metric("Sentimen Positif", f"{total_pos:,}")
col_idx2 += 1
cols2[col_idx2].metric("Sentimen Netral", f"{total_net:,}")
col_idx2 += 1
cols2[col_idx2].metric("Sentimen Negatif", f"{total_neg:,}")
col_idx2 += 1
    # Year range metric (if available)
if available_cols['has_tahun']:
if 'tahun' in df_filtered.columns:
tahun_valid = df_filtered['tahun'].dropna()
if len(tahun_valid) > 0:
tahun_min = int(tahun_valid.min())
tahun_max = int(tahun_valid.max())
if tahun_min == tahun_max:
cols2[col_idx2].metric("Tahun", f"{tahun_min}")
else:
cols2[col_idx2].metric(
"Rentang Tahun", f"{tahun_min} - {tahun_max}")
else:
cols2[col_idx2].metric("Rentang Tahun", "N/A")
else:
cols2[col_idx2].metric("Rentang Tahun", "N/A")
col_idx2 += 1
    # Average word count metric (if available)
if 'kritik_saran' in df_filtered.columns and len(df_filtered) > 0:
try:
word_counts = df_filtered['kritik_saran'].astype(
str).str.split().str.len()
avg_word_count = round(word_counts.mean(), 1)
cols2[col_idx2].metric(
"Rata-rata Panjang Kata", f"{avg_word_count} kata")
except Exception:
cols2[col_idx2].metric("Rata-rata Panjang Kata", "N/A")
st.markdown("---")
st.markdown("### Visualisasi Data")
    # Sentiment visualizations (always shown)
col1, col2 = st.columns(2)
with col1:
show_sentiment_bar_chart(df_filtered, ASPEK_COLUMNS)
with col2:
show_sentiment_pie_chart(df_filtered, ASPEK_COLUMNS)
    # Distribution visualizations based on the available columns
    viz_shown = False
    # Year and semester visualizations (if available)
if available_cols['has_tahun'] or available_cols['has_semester']:
col1, col2 = st.columns(2)
with col1:
if available_cols['has_tahun']:
result = show_year_distribution(df_filtered)
if result:
viz_shown = True
with col2:
if available_cols['has_semester']:
result = show_semester_distribution(df_filtered)
if result:
viz_shown = True
    # Study program visualization (if available)
if available_cols['has_prodi']:
st.markdown("---")
result = show_prodi_distribution(df_filtered)
if result:
viz_shown = True
    # Top 10 courses visualization (if available)
if available_cols['has_matkul']:
st.markdown("---")
result = show_top10_matkul_distribution(df_filtered)
if result:
viz_shown = True
    # Sentiment by year/semester visualizations (if available)
if available_cols['has_tahun'] or available_cols['has_semester']:
st.markdown("---")
col1, col2 = st.columns(2)
with col1:
if available_cols['has_tahun']:
result = show_sentiment_by_year(df_filtered, ASPEK_COLUMNS)
if result:
viz_shown = True
with col2:
if available_cols['has_semester']:
result = show_sentiment_by_semester(df_filtered, ASPEK_COLUMNS)
if result:
viz_shown = True
    # Sentiment by study program visualization (if available)
if available_cols['has_prodi']:
st.markdown("---")
result = show_sentiment_by_prodi(df_filtered, ASPEK_COLUMNS)
if result:
viz_shown = True
    # Sentiment by top 10 courses visualization (if available)
if available_cols['has_matkul']:
st.markdown("---")
result = show_sentiment_by_top10_matkul(df_filtered, ASPEK_COLUMNS)
if result:
viz_shown = True
# Application footer
st.caption("""
""", unsafe_allow_html=True)