|
|
import streamlit as st |
|
|
import torch |
|
|
import torchaudio |
|
|
|
|
|
from speechbrain.inference.speaker import EncoderClassifier |
|
|
from speechbrain.inference.enhancement import SpectralMaskEnhancement |
|
|
from speechbrain.inference.classifiers import AudioClassifier |
|
|
import os |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def load_models():
    """Load the speaker-verification, keyword-spotting and enhancement models.

    The function is wrapped in ``st.cache_resource``, so the pretrained
    checkpoints are fetched into ``pretrained_models/`` and instantiated by
    this function only when Streamlit has no cached result for it.

    Returns:
        tuple: ``(spk_model, kws_model, enhancer)`` where ``spk_model`` is the
        VoxCeleb x-vector speaker encoder, ``kws_model`` is the Google Speech
        Commands x-vector classifier and ``enhancer`` is the MetricGAN+
        spectral-mask speech enhancer.
    """
    spk_model = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-xvect-voxceleb",
        savedir="pretrained_models/spkrec-xvect-voxceleb",
    )

    # The Speech Commands x-vector checkpoint is published for the
    # EncoderClassifier interface, whose classify_batch() returns the
    # (out_prob, score, index, text_lab) tuple this script indexes.
    # NOTE(review): the previously used AudioClassifier appears to require
    # STFT modules this checkpoint does not define -- confirm if reverting.
    kws_model = EncoderClassifier.from_hparams(
        source="speechbrain/google_speech_command_xvector",
        savedir="pretrained_models/google_speech_command_xvector",
    )

    enhancer = SpectralMaskEnhancement.from_hparams(
        source="speechbrain/metricgan-plus-voicebank",
        savedir="pretrained_models/metricgan-plus-voicebank",
    )

    return spk_model, kws_model, enhancer
|
|
|
|
|
|
|
|
# Instantiate the three models at import time; load_models() is decorated
# with st.cache_resource, so Streamlit can reuse its cached result.
spk_model, kws_model, enhancer = load_models()

# Folder scanned for enrollment audio: one sub-directory of .wav files per
# speaker, the sub-directory name being used as the speaker name.
ENROLL_DIR = "enroll/"

# The best cosine similarity must be strictly greater than this value for
# the uploaded voice to be accepted as an enrolled speaker.
THRESHOLD = 0.85
|
|
|
|
|
|
|
|
|
|
|
def preprocess_audio(wav_file):
    """Load an audio file and return a mono, 16 kHz, enhanced waveform.

    Processing order: decode, downmix to one channel, resample to 16 kHz,
    then run the speech enhancer.

    Args:
        wav_file: A path or file-like object accepted by ``torchaudio.load``.

    Returns:
        A tensor of shape ``(1, num_samples)`` sampled at 16 kHz, or ``None``
        if any step fails (the failure is reported with ``st.error``).
    """
    try:
        # A file-like upload may already have been read once; rewind it so
        # decoding always starts from the first byte.
        if hasattr(wav_file, "seek"):
            wav_file.seek(0)

        sig, fs = torchaudio.load(wav_file)

        # torchaudio returns (channels, samples). Downmix so the tensor is a
        # batch holding exactly one utterance.
        if sig.shape[0] > 1:
            sig = sig.mean(dim=0, keepdim=True)

        # Resample before enhancement so the enhancer always receives 16 kHz
        # audio, the rate its model card states it was trained on.
        if fs != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
            sig = resampler(sig)

        if enhancer is not None:
            with torch.no_grad():
                # SpeechBrain expects *relative* lengths in [0, 1]; a single
                # unpadded utterance therefore has length 1.0, not its
                # sample count.
                sig = enhancer.enhance_batch(sig, lengths=torch.tensor([1.0]))

        return sig
    except Exception as e:
        # Best-effort boundary for the UI: show the problem instead of
        # crashing the Streamlit run, and let the caller skip verification.
        st.error(f"Error memproses audio: {e}")
        return None
|
|
|
|
|
@st.cache_data
def get_enrollment_embeddings():
    """Build one averaged voice embedding per enrolled speaker.

    Every sub-directory of ``ENROLL_DIR`` is treated as one speaker named
    after the directory. Each ``.wav`` file inside it is downmixed to mono,
    resampled to 16 kHz and encoded with ``spk_model``; the speaker's
    reference embedding is the mean of those per-file embeddings.

    Files that fail to load or encode are reported with ``st.error`` and
    skipped. Speakers with no usable file are omitted from the result.

    Returns:
        dict: speaker name -> mean embedding as a NumPy array. Empty when
        ``ENROLL_DIR`` is missing or contains no usable audio.
    """
    enrollment_data = {}

    # isdir (rather than exists) also rejects a plain file named like the
    # enrollment folder, which os.listdir could not handle.
    if not os.path.isdir(ENROLL_DIR):
        st.warning(f"Folder '{ENROLL_DIR}' tidak ditemukan.")
        return enrollment_data

    # Sorted listings keep the processing order independent of the
    # filesystem's directory ordering.
    for speaker_name in sorted(os.listdir(ENROLL_DIR)):
        speaker_dir = os.path.join(ENROLL_DIR, speaker_name)
        if not os.path.isdir(speaker_dir):
            continue

        embeddings = []
        for wav_file in sorted(os.listdir(speaker_dir)):
            # Accept ".WAV" as well as ".wav".
            if not wav_file.lower().endswith(".wav"):
                continue

            wav_path = os.path.join(speaker_dir, wav_file)
            try:
                sig, fs = torchaudio.load(wav_path)

                # Downmix so a stereo file yields one embedding vector with
                # the same shape as a mono file's.
                if sig.shape[0] > 1:
                    sig = sig.mean(dim=0, keepdim=True)

                if fs != 16000:
                    resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)
                    sig = resampler(sig)

                with torch.no_grad():
                    emb = spk_model.encode_batch(sig)

                # .cpu() keeps the NumPy conversion valid even if the model
                # produced the embedding on another device.
                embeddings.append(emb.squeeze().cpu().numpy())
            except Exception as e:
                # One unreadable file must not discard the whole enrollment.
                st.error(f"Gagal memproses {wav_path}: {e}")

        if embeddings:
            enrollment_data[speaker_name] = np.mean(embeddings, axis=0)

    # NOTE(review): enrollment audio is not passed through the enhancer,
    # while uploads go through preprocess_audio(); confirm this asymmetry
    # is intended, since it can shift similarity scores.
    return enrollment_data
|
|
|
|
|
|
|
|
# NOTE(review): the "π" characters in the title and the command messages look
# like mis-decoded emoji. The original glyphs cannot be recovered from this
# file, so those strings are left untouched. The "β" markers on the
# access-granted / access-denied messages were restored to ✅ / ❌ (the first
# one was split across two lines, which was a syntax error) -- confirm against
# the original source.


def _verify_speaker(signal, enrolled):
    """Score *signal* against every enrolled embedding.

    Writes one similarity line per enrolled speaker to the page.

    Args:
        signal: Waveform tensor accepted by ``spk_model.encode_batch``.
        enrolled: Mapping of speaker name to reference embedding.

    Returns:
        tuple: ``(best_match, best_score)``. ``best_match`` stays
        ``"Tidak Dikenali"`` with a score of 0 when no speaker scores
        above zero.
    """
    with torch.no_grad():
        upload_embedding = spk_model.encode_batch(signal).squeeze().cpu().numpy()

    # The query row is loop-invariant, so reshape it once.
    query = upload_embedding.reshape(1, -1)

    best_score = 0
    best_match = "Tidak Dikenali"
    for speaker_name, enrolled_emb in enrolled.items():
        score = cosine_similarity(query, enrolled_emb.reshape(1, -1))[0][0]
        st.write(f"Skor kemiripan dengan {speaker_name}: **{score:.2f}**")
        if score > best_score:
            best_score = score
            best_match = speaker_name

    return best_match, best_score


def _detect_command(signal, speaker):
    """Classify the spoken keyword in *signal* and report it for *speaker*."""
    with st.spinner("Mendeteksi perintah..."):
        with torch.no_grad():
            prediction = kws_model.classify_batch(signal)

        # Index 0 is used as the per-class score tensor and index 3 as the
        # list of decoded labels.
        # NOTE(review): the scores may be log-probabilities rather than
        # probabilities -- confirm before presenting them as confidence.
        top_prob = torch.max(prediction[0]).item()
        top_label = prediction[3][0]

    st.write(f"Perintah terdeteksi: **{top_label}** (Keyakinan: {top_prob:.2f})")

    command = top_label.lower()
    if command == "up":
        st.balloons()
        st.success(f"π **Perintah Diterima**: `{speaker}` berkata 'BUKA'.")
    elif command == "down":
        st.success(f"π **Perintah Diterima**: `{speaker}` berkata 'TUTUP'.")
    else:
        st.warning(f"Perintah '{top_label}' tidak dikenali sebagai 'Buka' atau 'Tutup'.")


st.title("Sistem Verifikasi Perintah Suara π")
st.write("Unggah file .wav untuk verifikasi.")

enrollment_embeddings = get_enrollment_embeddings()

if not enrollment_embeddings:
    st.error("Tidak ada data pendaftaran yang ditemukan. Pastikan folder 'enroll' ada dan berisi file .wav.")
else:
    st.success(f"Berhasil memuat data pendaftaran untuk: {list(enrollment_embeddings.keys())}")

    # Verification is only offered once at least one speaker is enrolled;
    # without reference embeddings every upload would be rejected.
    uploaded_file = st.file_uploader("Pilih file audio...", type=["wav"])

    if uploaded_file is not None:
        st.audio(uploaded_file, format="audio/wav")

        if st.button("Verifikasi Sekarang"):
            with st.spinner("Memproses audio..."):
                signal = preprocess_audio(uploaded_file)

            # preprocess_audio() returns None after reporting its own error.
            if signal is not None:
                st.subheader("Tahap 1: Verifikasi Speaker")
                best_match, best_score = _verify_speaker(signal, enrollment_embeddings)

                if best_score > THRESHOLD:
                    st.success(f"✅ **Akses Diberikan**: Dikenali sebagai **{best_match}** (Skor: {best_score:.2f})")

                    st.subheader("Tahap 2: Deteksi Perintah")
                    _detect_command(signal, best_match)
                else:
                    st.error(f"❌ **Akses Ditolak**: Suara tidak dikenali (Skor tertinggi: {best_score:.2f})")