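"""Empath AI: multimodal emotion detection demo (Gradio).

Three tabs, one per modality: text, face image (webcam/upload), and voice
(microphone/upload). Each tab runs a Hugging Face `transformers` pipeline and
shows the top emotion scores in a `gr.Label`.
"""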
import gradio as gr
from transformers import pipeline
from PIL import Image

# Pretrained checkpoints for each modality (Hugging Face Hub model IDs).
TEXT_MODEL  = "j-hartmann/emotion-english-distilroberta-base"   # emotion from English text
IMAGE_MODEL = "trpakov/vit-face-expression"                     # facial-expression classification
AUDIO_MODEL = "superb/hubert-large-superb-er"                   # speech emotion recognition (SUPERB ER)

# top_k=None makes each pipeline return scores for every label, not just the best one.
text_pipe  = pipeline("text-classification",  model=TEXT_MODEL,  top_k=None)
image_pipe = pipeline("image-classification", model=IMAGE_MODEL, top_k=None)
audio_pipe = pipeline("audio-classification", model=AUDIO_MODEL, top_k=None)
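# Optional sketch (assumption: torch with CUDA is installed): pipelines accept a
# `device` argument to run on the first GPU instead of CPU, e.g.
#   text_pipe = pipeline("text-classification", model=TEXT_MODEL, top_k=None, device=0)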

def _as_label_dict(preds):
    """Convert a list of {label, score} dicts into a label -> score mapping for gr.Label."""
    preds_sorted = sorted(preds, key=lambda p: p["score"], reverse=True)
    return {p["label"]: float(round(p["score"], 4)) for p in preds_sorted}
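# Example: _as_label_dict([{"label": "joy", "score": 0.912}, {"label": "anger", "score": 0.051}])
# returns {"joy": 0.912, "anger": 0.051}, which gr.Label renders as a ranked score list.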

def analyze_text(text: str):
    """Classify the emotion of a piece of English text."""
    if not text or not text.strip():
        return {"(enter some text)": 1.0}
    preds = text_pipe(text)[0]  # [0]: all-label scores for the single input string
    return _as_label_dict(preds)

def analyze_face(img):
    """Classify the facial expression in a webcam frame or uploaded image."""
    if img is None:
        return {"(no image)": 1.0}
    # gr.Image(type="pil") already yields a PIL image; convert defensively otherwise.
    pil = img if isinstance(img, Image.Image) else Image.fromarray(img)
    preds = image_pipe(pil)
    return _as_label_dict(preds)

def analyze_voice(audio_path):
    """Classify the emotion in a recorded or uploaded audio clip."""
    if audio_path is None:
        return {"(no audio)": 1.0}
    preds = audio_pipe(audio_path)  # the pipeline decodes and resamples the file itself
    return _as_label_dict(preds)

with gr.Blocks(title="Empath AI — Multimodal Emotion Detection") as demo:
    gr.Markdown(
        """
        # Empath AI — Emotion Detection (Text • Face • Voice)
        Grant permission when the browser asks for **camera/microphone**.
        Nothing is stored; analysis happens in memory and the scores are shown back to you.
        """
    )

    with gr.Tab("Text"):
        t_in  = gr.Textbox(label="Enter text", lines=3, placeholder="Type something here…")
        t_btn = gr.Button("Analyze Text", variant="primary")
        t_out = gr.Label(num_top_classes=3)
        t_btn.click(analyze_text, inputs=t_in, outputs=t_out)

    with gr.Tab("Face (Webcam or Upload)"):
        i_in  = gr.Image(sources=["webcam", "upload"], type="pil", label="Webcam / Upload")
        i_btn = gr.Button("Analyze Face", variant="primary")
        i_out = gr.Label(num_top_classes=3)
        i_btn.click(analyze_face, inputs=i_in, outputs=i_out)

    with gr.Tab("Voice (Mic or Upload)"):
        a_in  = gr.Audio(sources=["microphone", "upload"], type="filepath",
                         label="Record or upload a short clip (≤30s)")
        a_btn = gr.Button("Analyze Voice", variant="primary")
        a_out = gr.Label(num_top_classes=3)
        a_btn.click(analyze_voice, inputs=a_in, outputs=a_out)

demo.launch()
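# Launch-option sketch (assumption: local, single-user use; adjust as needed):
#   demo.launch(server_name="0.0.0.0", server_port=7860)  # listen on all interfaces
#   demo.launch(share=True)                               # temporary public gradio.live link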