Kevin King committed
Commit · e83cd54
Parent: 764dc1d

REFAC: Update requirements and enhance Streamlit app for multimodal emotion analysis

Files changed:
- requirements.txt (+3 -2)
- src/streamlit_app.py (+85 -67)
requirements.txt CHANGED

@@ -17,10 +17,11 @@ tf-keras==2.16.0
 torch==2.7.0
 torchaudio==2.7.0
 
-# Pin data/audio libraries for stability
+# Pin data/audio libraries for stability and new features
 pandas==2.2.2
 numpy==1.26.4
 soundfile==0.12.1
 librosa==0.10.1
 scipy==1.13.0
-Pillow==10.3.0
+Pillow==10.3.0
+scikit-learn==1.4.2
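The new scikit-learn pin backs the cosine-similarity comparison the updated app performs between per-modality emotion vectors. A minimal sketch of that call, using illustrative vectors (not real app output) in the app's unified order ['angry', 'happy', 'sad', 'neutral']:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Illustrative emotion-score vectors for two modalities (made-up values)
face_vector = np.array([[0.05, 0.80, 0.05, 0.10]])
speech_vector = np.array([[0.10, 0.70, 0.05, 0.15]])

# cosine_similarity expects 2-D arrays and returns a similarity matrix
similarity = cosine_similarity(face_vector, speech_vector)[0][0]
print(f"Face vs. speech similarity: {similarity:.2f}")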
src/streamlit_app.py CHANGED

@@ -11,48 +11,33 @@ import tempfile
 import cv2
 from moviepy.editor import VideoFileClip
 import time
-import shutil
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
 
-# --- Create a cross-platform, writable cache directory
+# --- Create a cross-platform, writable cache directory ---
 CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ['DEEPFACE_HOME'] = CACHE_DIR
 os.environ['HF_HOME'] = CACHE_DIR
 
-# Define paths for the pre-included model weights
-MODEL_NAME = "facial_expression_model_weights.h5"
-SOURCE_PATH = os.path.join("src", "weights", MODEL_NAME)
-DEST_DIR = os.path.join(CACHE_DIR, ".deepface", "weights")
-DEST_PATH = os.path.join(DEST_DIR, MODEL_NAME)
-
-# Create the destination directory if it doesn't exist and copy the model
-if not os.path.exists(DEST_PATH):
-    print(f"Model not found in cache. Copying from {SOURCE_PATH} to {DEST_PATH}...")
-    os.makedirs(DEST_DIR, exist_ok=True)
-    try:
-        shutil.copy(SOURCE_PATH, DEST_PATH)
-        print("Model copied successfully.")
-    except FileNotFoundError:
-        print(f"Warning: Local model file not found at {SOURCE_PATH}. App will attempt to download it.")
-    except Exception as e:
-        print(f"Error copying model file: {e}")
-
 # --- Page Configuration ---
 st.set_page_config(page_title="AffectLink Demo", page_icon="😊", layout="wide")
 st.title("AffectLink: Post-Hoc Emotion Analysis")
-st.write("Upload a short video clip (under 30 seconds) to
+st.write("Upload a short video clip (under 30 seconds) to see a multimodal emotion analysis.")
 
 # --- Logger Configuration ---
-
+# [Logger setup remains the same]
 logging.basicConfig(level=logging.INFO)
 logging.getLogger('deepface').setLevel(logging.ERROR)
 logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
 logging.getLogger('moviepy').setLevel(logging.ERROR)
 
+
 # --- Emotion Mappings ---
-UNIFIED_EMOTIONS = ['
+UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral'] # Defined order for vectors
 TEXT_TO_UNIFIED = {'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}
 SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}
+FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry'}
 AUDIO_SAMPLE_RATE = 16000
 
 # --- Model Loading ---
@@ -68,6 +53,29 @@ def load_models():
 
 whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
 
+# --- Helper Functions for Analysis ---
+def create_unified_vector(scores_dict, mapping_dict):
+    """Creates a normalized vector from a dictionary of scores based on a mapping."""
+    vector = np.zeros(len(UNIFIED_EMOTIONS))
+    for label, score in scores_dict.items():
+        unified_label = mapping_dict.get(label)
+        if unified_label and unified_label in UNIFIED_EMOTIONS:
+            idx = UNIFIED_EMOTIONS.index(unified_label)
+            vector[idx] += score
+
+    # Normalize the vector
+    norm = np.linalg.norm(vector)
+    if norm > 0:
+        vector /= norm
+    return vector
+
+def get_consistency_level(cosine_sim):
+    """Convert cosine similarity to a qualitative label."""
+    if cosine_sim >= 0.8: return "High"
+    if cosine_sim >= 0.6: return "Medium"
+    if cosine_sim >= 0.3: return "Low"
+    return "Very Low"
+
 # --- UI and Processing Logic ---
 uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi", "mkv"])
 
@@ -81,9 +89,11 @@ if uploaded_file is not None:
         st.video(temp_video_path)
 
         if st.button("Analyze Video"):
-
+            # Dictionaries to hold all results
+            fer_timeline = {}
            audio_analysis_results = {}
 
+            # --- Video Processing ---
            with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
                cap = None
                try:
@@ -97,16 +107,15 @@ if uploaded_file is not None:
                        timestamp = frame_count / fps
                        analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
                        if isinstance(analysis, list) and len(analysis) > 0:
-
+                            # Store the full emotion dictionary for the plot
+                            fer_timeline[timestamp] = analysis[0]['emotion']
                        frame_count += 1
-                except Exception as e:
-                    st.error(f"An error occurred during facial analysis: {e}")
                finally:
                    if cap: cap.release()
 
+            # --- Audio Processing ---
            with st.spinner("Extracting and analyzing audio..."):
                video_clip = None
-                temp_audio_path = None
                try:
                    video_clip = VideoFileClip(temp_video_path)
                    if video_clip.audio:
@@ -114,65 +123,74 @@ if uploaded_file is not None:
                            video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
                            temp_audio_path = taudio.name
 
+                        # Run all audio models
                        result = whisper_model.transcribe(temp_audio_path, fp16=False)
-                        transcribed_text = result['text']
-                        audio_analysis_results['Transcription'] = transcribed_text
+                        transcribed_text = result['text'].strip()
+                        audio_analysis_results['Transcription'] = transcribed_text if transcribed_text else "No speech detected."
 
-                        if
+                        if transcribed_text:
                            text_emotions = text_classifier(transcribed_text)[0]
-
-                            for emo in text_emotions:
-                                unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
-                                if unified_emo: unified_text_scores[unified_emo] += emo['score']
-                            audio_analysis_results['Text Emotion'] = max(unified_text_scores, key=unified_text_scores.get).capitalize()
+                            audio_analysis_results['Text Emotion Scores'] = {emo['label']: emo['score'] for emo in text_emotions}
 
                        audio_array, _ = sf.read(temp_audio_path, dtype='float32')
-
-                        if audio_array.ndim == 2:
-                            audio_array = audio_array.mean(axis=1)
-
-                        min_length = 1024
-                        if len(audio_array) < min_length:
-                            padding = np.zeros(min_length - len(audio_array), dtype=np.float32)
-                            audio_array = np.concatenate([audio_array, padding])
+                        if audio_array.ndim == 2: audio_array = audio_array.mean(axis=1)
+                        if len(audio_array) < 1024: audio_array = np.pad(audio_array, (0, 1024 - len(audio_array)))
 
                        inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
                        with torch.no_grad():
                            logits = ser_model(**inputs).logits
                        scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-
-
-                            raw_emo = ser_model.config.id2label[i]
-                            unified_emo = SER_TO_UNIFIED.get(raw_emo)
-                            if unified_emo: unified_ser_scores[unified_emo] += score.item()
-                        audio_analysis_results['Speech Emotion'] = max(unified_ser_scores, key=unified_ser_scores.get).capitalize()
+                        ser_scores = {ser_model.config.id2label[i]: score.item() for i, score in enumerate(scores)}
+                        audio_analysis_results['Speech Emotion Scores'] = ser_scores
                    else:
-                        audio_analysis_results['Transcription'] = "No audio track found
-
-                except Exception as e:
-                    st.error(f"An error occurred during audio analysis: {e}")
+                        audio_analysis_results['Transcription'] = "No audio track found."
                finally:
                    if video_clip: video_clip.close()
-                    if temp_audio_path and os.path.exists(temp_audio_path): os.unlink(temp_audio_path)
-
+                    if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path): os.unlink(temp_audio_path)
+
+            # --- Post-Analysis and Visualization ---
            st.header("Analysis Results")
-
+
+            # Prepare data for display
+            fer_avg_scores = pd.DataFrame(fer_timeline).T.mean().to_dict() if fer_timeline else {}
+            ser_scores = audio_analysis_results.get('Speech Emotion Scores', {})
+            text_scores = audio_analysis_results.get('Text Emotion Scores', {})
+
+            # Create vectors for cosine similarity
+            fer_vector = create_unified_vector(fer_avg_scores, FACIAL_TO_UNIFIED)
+            ser_vector = create_unified_vector(ser_scores, SER_TO_UNIFIED)
+            text_vector = create_unified_vector(text_scores, TEXT_TO_UNIFIED)
+
+            # Calculate similarities
+            sim_face_text = cosine_similarity([fer_vector], [text_vector])[0][0]
+            sim_face_speech = cosine_similarity([fer_vector], [ser_vector])[0][0]
+            sim_speech_text = cosine_similarity([ser_vector], [text_vector])[0][0]
+            avg_similarity = np.mean([sim_face_text, sim_face_speech, sim_speech_text])
+
+            # Display metrics
+            col1, col2 = st.columns([1, 2])
            with col1:
-                st.subheader("
+                st.subheader("Multimodal Summary")
                st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
-                st.metric("
-                st.metric("
+                st.metric("Dominant Facial Emotion", max(fer_avg_scores, key=fer_avg_scores.get).capitalize() if fer_avg_scores else "N/A")
+                st.metric("Dominant Text Emotion", max(text_scores, key=lambda k: TEXT_TO_UNIFIED.get(k) is not None and text_scores.get(k) or -1).capitalize() if text_scores else "N/A")
+                st.metric("Dominant Speech Emotion", max(ser_scores, key=lambda k: SER_TO_UNIFIED.get(k) is not None and ser_scores.get(k) or -1).capitalize() if ser_scores else "N/A")
+                st.metric("Emotion Consistency", get_consistency_level(avg_similarity), f"{avg_similarity:.2f} Avg. Cosine Similarity")
+
            with col2:
-                st.subheader("Facial
-                if
-
-
+                st.subheader("Facial Emotion Over Time")
+                if fer_timeline:
+                    # Convert timeline to a DataFrame suitable for st.line_chart
+                    df = pd.DataFrame(fer_timeline).T
+                    # Filter for only the unified emotions we care about
+                    df_filtered = df[list(FACIAL_TO_UNIFIED.keys())].rename(columns=FACIAL_TO_UNIFIED)
+                    st.line_chart(df_filtered)
                else:
-                    st.write("No faces detected
-
+                    st.write("No faces detected to plot.")
+
    finally:
        if temp_video_path and os.path.exists(temp_video_path):
-            time.sleep(1)
+            time.sleep(1)
            try:
                os.unlink(temp_video_path)
            except Exception:
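For reference, the consistency rating introduced in this commit can be exercised outside Streamlit. The sketch below restates the diff's create_unified_vector and get_consistency_level helpers (lightly condensed) and feeds them made-up score dictionaries; the numbers are illustrative, not model output:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

UNIFIED_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry'}
SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}
TEXT_TO_UNIFIED = {'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}

def create_unified_vector(scores_dict, mapping_dict):
    """Map per-model scores onto the unified emotion order and L2-normalize."""
    vector = np.zeros(len(UNIFIED_EMOTIONS))
    for label, score in scores_dict.items():
        unified_label = mapping_dict.get(label)
        if unified_label in UNIFIED_EMOTIONS:
            vector[UNIFIED_EMOTIONS.index(unified_label)] += score
    norm = np.linalg.norm(vector)
    return vector / norm if norm > 0 else vector

def get_consistency_level(cosine_sim):
    """Convert an average cosine similarity into the qualitative label shown in the UI."""
    if cosine_sim >= 0.8: return "High"
    if cosine_sim >= 0.6: return "Medium"
    if cosine_sim >= 0.3: return "Low"
    return "Very Low"

# Made-up per-modality scores for one clip: DeepFace-style percentages,
# SER softmax scores, and text-classifier softmax scores.
fer_avg_scores = {'angry': 5.0, 'happy': 60.0, 'sad': 10.0, 'neutral': 25.0}
ser_scores = {'ang': 0.05, 'hap': 0.70, 'sad': 0.05, 'neu': 0.20}
text_scores = {'anger': 0.05, 'joy': 0.75, 'sadness': 0.05, 'neutral': 0.15}

fer_vec = create_unified_vector(fer_avg_scores, FACIAL_TO_UNIFIED)
ser_vec = create_unified_vector(ser_scores, SER_TO_UNIFIED)
text_vec = create_unified_vector(text_scores, TEXT_TO_UNIFIED)

avg_similarity = np.mean([
    cosine_similarity([fer_vec], [text_vec])[0][0],
    cosine_similarity([fer_vec], [ser_vec])[0][0],
    cosine_similarity([ser_vec], [text_vec])[0][0],
])
print(f"{avg_similarity:.2f} -> {get_consistency_level(avg_similarity)}")

Because every score is non-negative, each pairwise similarity lands in [0, 1], so the thresholds in get_consistency_level cover the full range the app can produce.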