# ==========================================================
# ADVANCED MULTIMODAL EMOTION & MOOD DETECTOR
# Combines Audio + Text + Multiple Models for robust accuracy
# ==========================================================
import gradio as gr
import numpy as np
import librosa
import soundfile as sf
import pandas as pd
from transformers import pipeline

# -------------------------------
# 1. AUDIO PREPROCESSING
# -------------------------------
def preprocess_audio(file_path):
    """Trim silence, normalize, and clean audio."""
    y, sr = librosa.load(file_path, sr=16000)
    y, _ = librosa.effects.trim(y, top_db=20)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    y = librosa.effects.preemphasis(y)
    processed_path = "processed.wav"
    sf.write(processed_path, y, sr)
    return processed_path

# -------------------------------
# 2. LOAD PIPELINES
# -------------------------------
print("šŸ”„ Loading models... this may take a minute")

# Speech-based emotion models (note: their label sets differ, so scores
# are averaged per label rather than merged across models)
speech_models = [
    "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
    "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
]
speech_pipes = [pipeline("audio-classification", model=m) for m in speech_models]

# Speech-to-text + text-based emotion model
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
text_emotion = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

# -------------------------------
# 3. EMOTION DETECTION FUNCTION
# -------------------------------
def detect_emotion(audio):
    audio = preprocess_audio(audio)

    # --- Speech emotion ensemble ---
    speech_scores = {}
    for p in speech_pipes:
        results = p(audio)
        for r in results:
            speech_scores[r["label"]] = speech_scores.get(r["label"], 0) + r["score"]
    # Average scores across models
    for k in speech_scores:
        speech_scores[k] /= len(speech_pipes)

    # --- Text emotion (from speech-to-text) ---
    transcription = asr(audio)["text"]
    text_results = text_emotion(transcription)
    text_scores = {}
    for r in text_results[0]:
        text_scores[r["label"]] = r["score"]

    # --- Combine both modalities (70% speech tone, 30% wording) ---
    combined = {}
    all_labels = set(speech_scores) | set(text_scores)
    for label in all_labels:
        combined[label] = speech_scores.get(label, 0) * 0.7 + text_scores.get(label, 0) * 0.3

    # --- Get top 3 emotions ---
    top3 = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:3]

    # --- Prepare output ---
    summary = f"🧠 Transcribed Text: ā€œ{transcription}ā€\n\nšŸŽ§ Top Detected Emotions:\n"
    for label, score in top3:
        summary += f" - {label.capitalize()}: {score * 100:.1f}%\n"

    # Create a DataFrame for plotting
    df = pd.DataFrame(top3, columns=["Emotion", "Confidence (%)"])
    df["Confidence (%)"] = df["Confidence (%)"] * 100

    return summary, gr.BarPlot(
        value=df, x="Emotion", y="Confidence (%)", title="Confidence by Emotion (%)"
    )

# -------------------------------
# 4. BUILD THE GRADIO INTERFACE
# -------------------------------
demo = gr.Interface(
    fn=detect_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="šŸŽ™ļø Speak or Upload Audio"),
    outputs=["text", gr.BarPlot()],
    title="šŸŽ­ Advanced Multimodal Emotion & Mood Detector",
    description=(
        "This AI listens to your **tone** and analyzes your **words** to understand your emotion.\n"
        "It combines multiple speech emotion models, text analysis, and ensemble averaging "
        "for more robust predictions."
    ),
    theme="soft",
    allow_flagging="never",
    css="""
    .gradio-container {
        background: linear-gradient(135deg, #f7f7ff, #eef3ff);
        color: #222;
    }
    #component-0 {font-size: 1.2rem;}
    """,
)

demo.launch()
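
# ----------------------------------------------------------
# OPTIONAL: LABEL ALIGNMENT SKETCH (not wired into the app above)
# ----------------------------------------------------------
# The speech and text models use different label vocabularies
# (e.g. "angry" vs. "anger"), so the weighted combination in
# detect_emotion only merges labels that happen to match exactly.
# The helper below is a minimal sketch of how both score dicts
# could be mapped onto one shared vocabulary before combining.
# The mapping keys are assumptions based on typical emotion label
# names; check them against the labels each model actually returns.
# If used, define it above detect_emotion and apply it to
# speech_scores and text_scores before the weighted sum.

CANONICAL_LABELS = {
    "angry": "anger", "anger": "anger",
    "happy": "joy", "joy": "joy",
    "sad": "sadness", "sadness": "sadness",
    "fearful": "fear", "fear": "fear",
    "surprised": "surprise", "surprise": "surprise",
    "disgust": "disgust",
    "neutral": "neutral", "calm": "neutral",
}

def normalize_scores(scores):
    """Map model-specific labels onto a shared vocabulary, summing duplicates."""
    normalized = {}
    for label, score in scores.items():
        key = CANONICAL_LABELS.get(label.lower(), label.lower())
        normalized[key] = normalized.get(key, 0.0) + score
    return normalized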