Spaces:

kingkw1
/

AffectLink

Sleeping

App Files Community

Kevin King committed on Aug 5

Commit

377a152

1 Parent(s): b2395f1

Refactor video processing logic to improve error handling and resource cleanup in Streamlit app

Browse files

Files changed (1) hide show

src/streamlit_app.py +34 -41

src/streamlit_app.py CHANGED Viewed

@@ -8,25 +8,21 @@ from deepface import DeepFace
 import logging
 import soundfile as sf
 import tempfile
-from PIL import Image
 import cv2
 from moviepy.editor import VideoFileClip
-# --- THIS IS THE FIX for Error 1 ---
 # Create a cross-platform, writable cache directory for all libraries
 CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ['DEEPFACE_HOME'] = CACHE_DIR
 os.environ['HF_HOME'] = CACHE_DIR
-# ====================================
 # --- Page Configuration ---
-st.set_page_config(
-    page_title="AffectLink Demo",
-    page_icon="😊",
-    layout="wide"
-)
 st.title("AffectLink: Post-Hoc Emotion Analysis")
 st.write("Upload a short video clip (under 30 seconds) to analyze facial expressions, speech-to-text, and the emotional tone of the audio.")
@@ -38,13 +34,8 @@ logging.getLogger('moviepy').setLevel(logging.ERROR)
 # --- Emotion Mappings ---
 UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
-TEXT_TO_UNIFIED = {
-    'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
-    'fear': None, 'surprise': None, 'disgust': None
-}
-SER_TO_UNIFIED = {
-    'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
-}
 AUDIO_SAMPLE_RATE = 16000
 # --- Model Loading ---
@@ -75,20 +66,17 @@ if uploaded_file is not None:
         if st.button("Analyze Video"):
             facial_analysis_results = []
             audio_analysis_results = {}
-            cap = None # Initialize cap to None
             # --- Video Processing ---
             with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
                 try:
                     cap = cv2.VideoCapture(temp_video_path)
-                    fps = cap.get(cv2.CAP_PROP_FPS)
-                    if fps == 0:
-                        fps = 30
                     frame_count = 0
                     while cap.isOpened():
                         ret, frame = cap.read()
-                        if not ret:
-                            break
                         if frame_count % int(fps) == 0:
                             timestamp = frame_count / fps
                             analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
@@ -98,12 +86,12 @@ if uploaded_file is not None:
                 except Exception as e:
                     st.error(f"An error occurred during facial analysis: {e}")
                 finally:
-                    if cap:
-                        cap.release() # --- THIS IS PART of the FIX for Error 3 ---
             # --- Audio Processing ---
             with st.spinner("Extracting and analyzing audio..."):
                 video_clip = None
                 try:
                     video_clip = VideoFileClip(temp_video_path)
                     if video_clip.audio:
@@ -112,7 +100,7 @@ if uploaded_file is not None:
                             temp_audio_path = taudio.name
                         result = whisper_model.transcribe(temp_audio_path, fp16=False)
-                        transcribed_text = result['text'] if result['text'] else "No speech detected."
                         audio_analysis_results['Transcription'] = transcribed_text
                         if "No speech detected" not in transcribed_text:
@@ -120,18 +108,17 @@ if uploaded_file is not None:
                             unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
                             for emo in text_emotions:
                                 unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
-                                if unified_emo:
-                                    unified_text_scores[unified_emo] += emo['score']
                             audio_analysis_results['Text Emotion'] = max(unified_text_scores, key=unified_text_scores.get).capitalize()
                         audio_array, _ = sf.read(temp_audio_path)
-                        # --- THIS IS THE FIX for Error 2 ---
-                        min_length = 400 # Minimum samples for the SER model
                         if len(audio_array) < min_length:
-                            padding = np.zeros(min_length - len(audio_array))
                             audio_array = np.concatenate([audio_array, padding])
-                        # =====================================
                         inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
                         with torch.no_grad():
@@ -141,18 +128,16 @@ if uploaded_file is not None:
                         for i, score in enumerate(scores):
                             raw_emo = ser_model.config.id2label[i]
                             unified_emo = SER_TO_UNIFIED.get(raw_emo)
-                            if unified_emo:
-                                unified_ser_scores[unified_emo] += score.item()
                         audio_analysis_results['Speech Emotion'] = max(unified_ser_scores, key=unified_ser_scores.get).capitalize()
-                        os.unlink(temp_audio_path)
                     else:
                         audio_analysis_results['Transcription'] = "No audio track found in the video."
                 except Exception as e:
                     st.error(f"An error occurred during audio analysis: {e}")
                 finally:
-                    if video_clip:
-                        video_clip.close() # --- THIS IS PART of the FIX for Error 3 ---
             # --- Display Results ---
             st.header("Analysis Results")
@@ -169,9 +154,17 @@ if uploaded_file is not None:
                         st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
                 else:
                     st.write("No faces detected or video processing failed.")
     finally:
-        # --- THIS IS THE FINAL PART of the FIX for Error 3 ---
-        # Clean up the temporary video file in a finally block to ensure it runs
         if temp_video_path and os.path.exists(temp_video_path):
-            os.unlink(temp_video_path)

 import logging
 import soundfile as sf
 import tempfile
 import cv2
 from moviepy.editor import VideoFileClip
+import time
+# --- THIS IS THE FIX for Error 1 (Local Pathing) ---
 # Create a cross-platform, writable cache directory for all libraries
+# On Windows, this will be in AppData\Local\Temp. On Linux (HF Spaces), it will be in /tmp
 CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ['DEEPFACE_HOME'] = CACHE_DIR
 os.environ['HF_HOME'] = CACHE_DIR
+# ===================================================
 # --- Page Configuration ---
+st.set_page_config(page_title="AffectLink Demo", page_icon="😊", layout="wide")
 st.title("AffectLink: Post-Hoc Emotion Analysis")
 st.write("Upload a short video clip (under 30 seconds) to analyze facial expressions, speech-to-text, and the emotional tone of the audio.")
 # --- Emotion Mappings ---
 UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
+TEXT_TO_UNIFIED = {'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry'}
+SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'}
 AUDIO_SAMPLE_RATE = 16000
 # --- Model Loading ---
         if st.button("Analyze Video"):
             facial_analysis_results = []
             audio_analysis_results = {}
             # --- Video Processing ---
             with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
+                cap = None
                 try:
                     cap = cv2.VideoCapture(temp_video_path)
+                    fps = cap.get(cv2.CAP_PROP_FPS) or 30
                     frame_count = 0
                     while cap.isOpened():
                         ret, frame = cap.read()
+                        if not ret: break
                         if frame_count % int(fps) == 0:
                             timestamp = frame_count / fps
                             analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
                 except Exception as e:
                     st.error(f"An error occurred during facial analysis: {e}")
                 finally:
+                    if cap: cap.release()
             # --- Audio Processing ---
             with st.spinner("Extracting and analyzing audio..."):
                 video_clip = None
+                temp_audio_path = None
                 try:
                     video_clip = VideoFileClip(temp_video_path)
                     if video_clip.audio:
                             temp_audio_path = taudio.name
                         result = whisper_model.transcribe(temp_audio_path, fp16=False)
+                        transcribed_text = result['text'] if result['text'].strip() else "No speech detected."
                         audio_analysis_results['Transcription'] = transcribed_text
                         if "No speech detected" not in transcribed_text:
                             unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
                             for emo in text_emotions:
                                 unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
+                                if unified_emo: unified_text_scores[unified_emo] += emo['score']
                             audio_analysis_results['Text Emotion'] = max(unified_text_scores, key=unified_text_scores.get).capitalize()
                         audio_array, _ = sf.read(temp_audio_path)
+                        # --- FIX for Error 2 ---
+                        min_length = 400
                         if len(audio_array) < min_length:
+                            padding = np.zeros(min_length - len(audio_array), dtype=np.float32)
                             audio_array = np.concatenate([audio_array, padding])
+                        # ======================
                         inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
                         with torch.no_grad():
                         for i, score in enumerate(scores):
                             raw_emo = ser_model.config.id2label[i]
                             unified_emo = SER_TO_UNIFIED.get(raw_emo)
+                            if unified_emo: unified_ser_scores[unified_emo] += score.item()
                         audio_analysis_results['Speech Emotion'] = max(unified_ser_scores, key=unified_ser_scores.get).capitalize()
                     else:
                         audio_analysis_results['Transcription'] = "No audio track found in the video."
                 except Exception as e:
                     st.error(f"An error occurred during audio analysis: {e}")
                 finally:
+                    if video_clip: video_clip.close()
+                    if temp_audio_path and os.path.exists(temp_audio_path): os.unlink(temp_audio_path)
             # --- Display Results ---
             st.header("Analysis Results")
                         st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
                 else:
                     st.write("No faces detected or video processing failed.")
     finally:
+        # --- FIX for Error 3 ---
+        # Ensure the video file is released before attempting to delete it
+        # This block runs after the 'Analyze Video' button logic completes
         if temp_video_path and os.path.exists(temp_video_path):
+            # A small delay can sometimes help ensure file locks are released
+            time.sleep(1)
+            try:
+                os.unlink(temp_video_path)
+            except PermissionError:
+                st.warning(f"Could not delete temporary video file. It may still be in use: {temp_video_path}")
+            except Exception as e:
+                st.warning(f"An error occurred while deleting the temporary video file: {e}")