# ==========================================================
# ADVANCED MULTIMODAL EMOTION & MOOD DETECTOR
# Combines Audio + Text + Multiple Models for robust accuracy
# ==========================================================
import gradio as gr
import numpy as np
import librosa
import soundfile as sf
import pandas as pd
from transformers import pipeline

# -------------------------------
# 1. AUDIO PREPROCESSING
# -------------------------------
def preprocess_audio(file_path):
    """Trim silence, normalize, and clean audio."""
    y, sr = librosa.load(file_path, sr=16000)
    y, _ = librosa.effects.trim(y, top_db=20)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    y = librosa.effects.preemphasis(y)
    processed_path = "processed.wav"
    sf.write(processed_path, y, sr)
    return processed_path

# -------------------------------
# 2. LOAD PIPELINES
# -------------------------------
print("šŸ”„ Loading models... this may take a minute")

# Speech-based emotion models (note: their label sets differ, so scores
# are averaged per label rather than merged across models)
speech_models = [
    "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
    "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
]
speech_pipes = [pipeline("audio-classification", model=m) for m in speech_models]

# Speech-to-text + text-based emotion model
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
text_emotion = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

# -------------------------------
# 3. EMOTION DETECTION FUNCTION
# -------------------------------
def detect_emotion(audio):
    audio = preprocess_audio(audio)

    # --- Speech emotion ensemble ---
    speech_scores = {}
    for p in speech_pipes:
        results = p(audio)
        for r in results:
            speech_scores[r["label"]] = speech_scores.get(r["label"], 0) + r["score"]
    # Average scores across models
    for k in speech_scores:
        speech_scores[k] /= len(speech_pipes)

    # --- Text emotion (from speech-to-text) ---
    transcription = asr(audio)["text"]
    text_results = text_emotion(transcription)
    text_scores = {}
    for r in text_results[0]:
        text_scores[r["label"]] = r["score"]

    # --- Combine both modalities (70% speech tone, 30% wording) ---
    combined = {}
    all_labels = set(speech_scores) | set(text_scores)
    for label in all_labels:
        combined[label] = speech_scores.get(label, 0) * 0.7 + text_scores.get(label, 0) * 0.3

    # --- Get top 3 emotions ---
    top3 = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:3]

    # --- Prepare output ---
    summary = f"🧠 Transcribed Text: ā€œ{transcription}ā€\n\nšŸŽ§ Top Detected Emotions:\n"
    for label, score in top3:
        summary += f" - {label.capitalize()}: {score * 100:.1f}%\n"

    # Create a DataFrame for plotting
    df = pd.DataFrame(top3, columns=["Emotion", "Confidence (%)"])
    df["Confidence (%)"] = df["Confidence (%)"] * 100

    return summary, gr.BarPlot(
        value=df, x="Emotion", y="Confidence (%)", title="Confidence by Emotion (%)"
    )

# -------------------------------
# 4. BUILD THE GRADIO INTERFACE
# -------------------------------
demo = gr.Interface(
    fn=detect_emotion,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", label="šŸŽ™ļø Speak or Upload Audio"),
    outputs=["text", gr.BarPlot()],
    title="šŸŽ­ Advanced Multimodal Emotion & Mood Detector",
    description=(
        "This AI listens to your **tone** and analyzes your **words** to understand your emotion.\n"
        "It combines multiple speech emotion models, text analysis, and ensemble averaging "
        "for more robust predictions."
    ),
    theme="soft",
    allow_flagging="never",
    css="""
    .gradio-container {
        background: linear-gradient(135deg, #f7f7ff, #eef3ff);
        color: #222;
    }
    #component-0 {font-size: 1.2rem;}
    """,
)

demo.launch()
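
# ----------------------------------------------------------
# OPTIONAL: LABEL ALIGNMENT SKETCH (not wired into the app above)
# ----------------------------------------------------------
# The speech and text models use different label vocabularies
# (e.g. "angry" vs. "anger"), so the weighted combination in
# detect_emotion only merges labels that happen to match exactly.
# The helper below is a minimal sketch of how both score dicts
# could be mapped onto one shared vocabulary before combining.
# The mapping keys are assumptions based on typical emotion label
# names; check them against the labels each model actually returns.
# If used, define it above detect_emotion and apply it to
# speech_scores and text_scores before the weighted sum.

CANONICAL_LABELS = {
    "angry": "anger", "anger": "anger",
    "happy": "joy", "joy": "joy",
    "sad": "sadness", "sadness": "sadness",
    "fearful": "fear", "fear": "fear",
    "surprised": "surprise", "surprise": "surprise",
    "disgust": "disgust",
    "neutral": "neutral", "calm": "neutral",
}

def normalize_scores(scores):
    """Map model-specific labels onto a shared vocabulary, summing duplicates."""
    normalized = {}
    for label, score in scores.items():
        key = CANONICAL_LABELS.get(label.lower(), label.lower())
        normalized[key] = normalized.get(key, 0.0) + score
    return normalized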