Kevin King committed · Commit d5ac657 · Parent(s): e83cd54

REFAC: Improve emotion mapping and display logic in Streamlit app

Files changed: src/streamlit_app.py (+24 -15)

src/streamlit_app.py CHANGED
@@ -26,7 +26,6 @@ st.title("AffectLink: Post-Hoc Emotion Analysis")
 st.write("Upload a short video clip (under 30 seconds) to see a multimodal emotion analysis.")
 
 # --- Logger Configuration ---
-# [Logger setup remains the same]
 logging.basicConfig(level=logging.INFO)
 logging.getLogger('deepface').setLevel(logging.ERROR)
 logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
@@ -34,10 +33,11 @@ logging.getLogger('moviepy').setLevel(logging.ERROR)
 
 
 # --- Emotion Mappings ---
-
+# This is the single source of truth for our final emotion space
+UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
 TEXT_TO_UNIFIED = {'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}
 SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}
-FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry'}
+FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry', 'fear': None, 'surprise': None, 'disgust': None}
 AUDIO_SAMPLE_RATE = 16000
 
 # --- Model Loading ---
@@ -58,12 +58,12 @@ def create_unified_vector(scores_dict, mapping_dict):
     """Creates a normalized vector from a dictionary of scores based on a mapping."""
     vector = np.zeros(len(UNIFIED_EMOTIONS))
     for label, score in scores_dict.items():
+        # Map the raw label (e.g., 'neu', 'joy') to our unified label ('neutral', 'happy')
         unified_label = mapping_dict.get(label)
-        if unified_label:
+        if unified_label in UNIFIED_EMOTIONS:
             idx = UNIFIED_EMOTIONS.index(unified_label)
             vector[idx] += score
 
-    # Normalize the vector
     norm = np.linalg.norm(vector)
     if norm > 0:
         vector /= norm
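For illustration, a minimal standalone sketch of how the revised create_unified_vector behaves, assuming the UNIFIED_EMOTIONS and SER_TO_UNIFIED mappings defined above and hypothetical speech-emotion scores (the return statement is added here to make the sketch self-contained):

import numpy as np

UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}

def create_unified_vector(scores_dict, mapping_dict):
    """Creates a normalized vector from a dictionary of scores based on a mapping."""
    vector = np.zeros(len(UNIFIED_EMOTIONS))
    for label, score in scores_dict.items():
        unified_label = mapping_dict.get(label)
        if unified_label in UNIFIED_EMOTIONS:  # labels mapped to None (e.g. 'fear') are skipped
            vector[UNIFIED_EMOTIONS.index(unified_label)] += score
    norm = np.linalg.norm(vector)
    return vector / norm if norm > 0 else vector

# Hypothetical raw scores from the speech-emotion model
print(create_unified_vector({'ang': 0.1, 'hap': 0.7, 'neu': 0.2}, SER_TO_UNIFIED))
# -> unit-length vector over ['angry', 'happy', 'sad', 'neutral']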
@@ -107,7 +107,6 @@ if uploaded_file is not None:
             timestamp = frame_count / fps
             analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
             if isinstance(analysis, list) and len(analysis) > 0:
-                # Store the full emotion dictionary for the plot
                 fer_timeline[timestamp] = analysis[0]['emotion']
             frame_count += 1
     finally:
@@ -123,7 +122,6 @@ if uploaded_file is not None:
         video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
         temp_audio_path = taudio.name
 
-        # Run all audio models
         result = whisper_model.transcribe(temp_audio_path, fp16=False)
         transcribed_text = result['text'].strip()
         audio_analysis_results['Transcription'] = transcribed_text if transcribed_text else "No speech detected."
@@ -156,7 +154,7 @@ if uploaded_file is not None:
     ser_scores = audio_analysis_results.get('Speech Emotion Scores', {})
     text_scores = audio_analysis_results.get('Text Emotion Scores', {})
 
-    # Create vectors
+    # Create vectors using the unified mappings. This ensures cosine similarity is correct.
    fer_vector = create_unified_vector(fer_avg_scores, FACIAL_TO_UNIFIED)
    ser_vector = create_unified_vector(ser_scores, SER_TO_UNIFIED)
    text_vector = create_unified_vector(text_scores, TEXT_TO_UNIFIED)
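To make the downstream consistency computation concrete, a small worked example with hypothetical unified vectors, using scikit-learn's cosine_similarity as the app does (cosine similarity is scale-invariant, so the exact normalization does not change the result):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical unified emotion vectors over ['angry', 'happy', 'sad', 'neutral']
fer_vector = np.array([0.05, 0.80, 0.05, 0.10])
ser_vector = np.array([0.10, 0.60, 0.10, 0.20])
text_vector = np.array([0.00, 0.70, 0.10, 0.20])

sim_face_text = cosine_similarity([fer_vector], [text_vector])[0][0]
sim_face_speech = cosine_similarity([fer_vector], [ser_vector])[0][0]
sim_speech_text = cosine_similarity([ser_vector], [text_vector])[0][0]
avg_similarity = np.mean([sim_face_text, sim_face_speech, sim_speech_text])
print(f"{avg_similarity:.2f}")  # values near 1.0 mean the three modalities broadly agree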
@@ -167,24 +165,35 @@ if uploaded_file is not None:
     sim_speech_text = cosine_similarity([ser_vector], [text_vector])[0][0]
     avg_similarity = np.mean([sim_face_text, sim_face_speech, sim_speech_text])
 
+    # --- THIS IS THE FIX: Map dominant emotions to unified labels before displaying ---
+    dominant_fer = max(fer_avg_scores, key=fer_avg_scores.get) if fer_avg_scores else "N/A"
+    dominant_text_raw = max(text_scores, key=text_scores.get) if text_scores else "N/A"
+    dominant_ser_raw = max(ser_scores, key=ser_scores.get) if ser_scores else "N/A"
+
+    # Convert raw dominant emotions to their unified, full-word versions for display
+    display_fer = FACIAL_TO_UNIFIED.get(dominant_fer, "N/A").capitalize()
+    display_text = TEXT_TO_UNIFIED.get(dominant_text_raw, "N/A").capitalize()
+    display_ser = SER_TO_UNIFIED.get(dominant_ser_raw, "N/A").capitalize()
+    # ===================================================================================
+
     # Display metrics
     col1, col2 = st.columns([1, 2])
     with col1:
         st.subheader("Multimodal Summary")
         st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
-        st.metric("Dominant Facial Emotion",
-        st.metric("Dominant Text Emotion",
-        st.metric("Dominant Speech Emotion",
+        st.metric("Dominant Facial Emotion", display_fer)
+        st.metric("Dominant Text Emotion", display_text)
+        st.metric("Dominant Speech Emotion", display_ser)
         st.metric("Emotion Consistency", get_consistency_level(avg_similarity), f"{avg_similarity:.2f} Avg. Cosine Similarity")
 
     with col2:
         st.subheader("Facial Emotion Over Time")
         if fer_timeline:
-            # Convert timeline to a DataFrame suitable for st.line_chart
             df = pd.DataFrame(fer_timeline).T
-            # Filter for only the unified emotions we care about
-
-
+            # Filter for only the unified emotions we care about for the plot
+            plot_columns = [k for k, v in FACIAL_TO_UNIFIED.items() if v is not None]
+            df_filtered = df[plot_columns].rename(columns=FACIAL_TO_UNIFIED)
+            st.line_chart(df_filtered[UNIFIED_EMOTIONS])  # Ensure consistent column order
         else:
             st.write("No faces detected to plot.")
 
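Finally, a quick trace of the display mapping added in the last hunk, with hypothetical averaged DeepFace scores. It assumes the dominant raw label maps to one of the four unified emotions; a label such as 'fear', which maps to None, would fall outside this path.

FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry',
                     'fear': None, 'surprise': None, 'disgust': None}

# Hypothetical per-frame averages produced earlier in the app
fer_avg_scores = {'happy': 55.2, 'neutral': 30.1, 'surprise': 14.7}

dominant_fer = max(fer_avg_scores, key=fer_avg_scores.get)             # 'happy'
display_fer = FACIAL_TO_UNIFIED.get(dominant_fer, "N/A").capitalize()  # 'Happy'
print(display_fer)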