Kevin King committed
Commit b2395f1 · 1 Parent(s): 555549c

Fix caching directory setup and improve video/audio processing error handling in Streamlit app

Files changed (1):
  1. src/streamlit_app.py  +115 -112
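
The core of the caching fix in the diff below is swapping hard-coded `/tmp` paths for a cache root derived from `tempfile.gettempdir()`, which also works on Windows and on hosts where `/tmp` is absent or read-only. A minimal standalone sketch of the pattern (names mirror the diff; the key constraint is that `DEEPFACE_HOME` and `HF_HOME` should be set before DeepFace and the Hugging Face libraries are first imported, since they resolve their cache locations from these variables when loaded):

```python
import os
import tempfile

# Writable, cross-platform cache root (same CACHE_DIR name as in the diff below).
CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# Set these before `import deepface` / `import transformers`, since both
# libraries pick up their cache location from the environment when loaded.
os.environ['DEEPFACE_HOME'] = CACHE_DIR
os.environ['HF_HOME'] = CACHE_DIR
```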
src/streamlit_app.py CHANGED
@@ -12,9 +12,13 @@ from PIL import Image
 import cv2
 from moviepy.editor import VideoFileClip
 
-# Set home directories for model caching to the writable /tmp folder
-os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
-os.environ['HF_HOME'] = '/tmp/huggingface'
+# --- THIS IS THE FIX for Error 1 ---
+# Create a cross-platform, writable cache directory for all libraries
+CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
+os.makedirs(CACHE_DIR, exist_ok=True)
+os.environ['DEEPFACE_HOME'] = CACHE_DIR
+os.environ['HF_HOME'] = CACHE_DIR
+# ====================================
 
 # --- Page Configuration ---
 st.set_page_config(
@@ -47,12 +51,11 @@ AUDIO_SAMPLE_RATE = 16000
 @st.cache_resource
 def load_models():
     with st.spinner("Loading AI models, this may take a moment..."):
-        whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
+        whisper_model = whisper.load_model("base", download_root=os.path.join(CACHE_DIR, "whisper"))
         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
         ser_model_name = "superb/hubert-large-superb-er"
         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
-        # DeepFace loads its own models on first use, no need to preload here.
     return whisper_model, text_classifier, ser_model, ser_feature_extractor
 
 whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
@@ -61,114 +64,114 @@ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
 uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi", "mkv"])
 
 if uploaded_file is not None:
-    # Save the uploaded file to a temporary location
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
-        tfile.write(uploaded_file.read())
-        temp_video_path = tfile.name
-
-    st.video(temp_video_path)
-
-    if st.button("Analyze Video"):
-        facial_analysis_results = []
-        audio_analysis_results = {}
-
-        # --- Video Processing for Facial Emotion ---
-        with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
-            try:
-                cap = cv2.VideoCapture(temp_video_path)
-                fps = cap.get(cv2.CAP_PROP_FPS)
-                if fps == 0:
-                    fps = 30 # Default to 30 fps if not available
-                frame_count = 0
-                while cap.isOpened():
-                    ret, frame = cap.read()
-                    if not ret:
-                        break
-
-                    if frame_count % int(fps) == 0:
-                        timestamp = frame_count / fps
-                        analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
-                        if isinstance(analysis, list) and len(analysis) > 0:
-                            dominant_emotion = analysis[0]['dominant_emotion']
-                            facial_analysis_results.append((timestamp, dominant_emotion.capitalize()))
-                    frame_count += 1
-                cap.release()
-            except Exception as e:
-                st.error(f"An error occurred during facial analysis: {e}")
-
-        # --- Audio Extraction and Processing ---
-        with st.spinner("Extracting and analyzing audio..."):
-            temp_audio_path = None
-            video_clip = None
-            try:
-                video_clip = VideoFileClip(temp_video_path)
-                if video_clip.audio is not None:
-                    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
-                        video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
-                        temp_audio_path = taudio.name
-
-                    # 1. Speech-to-Text (Whisper)
-                    result = whisper_model.transcribe(temp_audio_path, fp16=False)
-                    transcribed_text = result['text'] if result['text'] else "No speech detected."
-                    audio_analysis_results['Transcription'] = transcribed_text
-
-                    # 2. Text-based Emotion
-                    if "No speech detected" not in transcribed_text:
-                        text_emotions = text_classifier(transcribed_text)[0]
-                        unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                        for emo in text_emotions:
-                            unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
-                            if unified_emo:
-                                unified_text_scores[unified_emo] += emo['score']
-                        dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
-                        audio_analysis_results['Text Emotion'] = dominant_text_emotion.capitalize()
-
-                    # 3. Speech Emotion Recognition (SER)
-                    audio_array, _ = sf.read(temp_audio_path)
-                    inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-                    with torch.no_grad():
-                        logits = ser_model(**inputs).logits
-                    scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-                    unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                    for i, score in enumerate(scores):
-                        raw_emo = ser_model.config.id2label[i]
-                        unified_emo = SER_TO_UNIFIED.get(raw_emo)
-                        if unified_emo:
-                            unified_ser_scores[unified_emo] += score.item()
-                    dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
-                    audio_analysis_results['Speech Emotion'] = dominant_ser_emotion.capitalize()
-                else:
-                    audio_analysis_results['Transcription'] = "No audio track found in the video."
-
-            except Exception as e:
-                st.error(f"An error occurred during audio analysis: {e}")
-            finally:
-                if video_clip:
-                    video_clip.close()
-                if temp_audio_path and os.path.exists(temp_audio_path):
-                    os.unlink(temp_audio_path)
-
-        # --- Display Results ---
-        st.header("Analysis Results")
-        col1, col2 = st.columns(2)
-
-        with col1:
-            st.subheader("Audio Analysis")
-            if audio_analysis_results:
-                st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
-                st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
-                st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
-            else:
-                st.write("No audio results to display.")
-
-        with col2:
-            st.subheader("Facial Expression Timeline")
-            if facial_analysis_results:
-                for timestamp, emotion in facial_analysis_results:
-                    st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
-            else:
-                st.write("No faces detected or video processing failed.")
-
-        # Clean up temp video file after analysis is done
-        if os.path.exists(temp_video_path):
-            os.unlink(temp_video_path)
+    temp_video_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
+            tfile.write(uploaded_file.read())
+            temp_video_path = tfile.name
+
+        st.video(temp_video_path)
+
+        if st.button("Analyze Video"):
+            facial_analysis_results = []
+            audio_analysis_results = {}
+            cap = None  # Initialize cap to None
+
+            # --- Video Processing ---
+            with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
+                try:
+                    cap = cv2.VideoCapture(temp_video_path)
+                    fps = cap.get(cv2.CAP_PROP_FPS)
+                    if fps == 0:
+                        fps = 30
+                    frame_count = 0
+                    while cap.isOpened():
+                        ret, frame = cap.read()
+                        if not ret:
+                            break
+                        if frame_count % int(fps) == 0:
+                            timestamp = frame_count / fps
+                            analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
+                            if isinstance(analysis, list) and len(analysis) > 0:
+                                facial_analysis_results.append((timestamp, analysis[0]['dominant_emotion'].capitalize()))
+                        frame_count += 1
+                except Exception as e:
+                    st.error(f"An error occurred during facial analysis: {e}")
+                finally:
+                    if cap:
+                        cap.release()  # --- THIS IS PART of the FIX for Error 3 ---
+
+            # --- Audio Processing ---
+            with st.spinner("Extracting and analyzing audio..."):
+                video_clip = None
+                try:
+                    video_clip = VideoFileClip(temp_video_path)
+                    if video_clip.audio:
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
+                            video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
+                            temp_audio_path = taudio.name
+
+                        result = whisper_model.transcribe(temp_audio_path, fp16=False)
+                        transcribed_text = result['text'] if result['text'] else "No speech detected."
+                        audio_analysis_results['Transcription'] = transcribed_text
+
+                        if "No speech detected" not in transcribed_text:
+                            text_emotions = text_classifier(transcribed_text)[0]
+                            unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                            for emo in text_emotions:
+                                unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
+                                if unified_emo:
+                                    unified_text_scores[unified_emo] += emo['score']
+                            audio_analysis_results['Text Emotion'] = max(unified_text_scores, key=unified_text_scores.get).capitalize()
+
+                        audio_array, _ = sf.read(temp_audio_path)
+
+                        # --- THIS IS THE FIX for Error 2 ---
+                        min_length = 400  # Minimum samples for the SER model
+                        if len(audio_array) < min_length:
+                            padding = np.zeros(min_length - len(audio_array))
+                            audio_array = np.concatenate([audio_array, padding])
+                        # =====================================
+
+                        inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                        with torch.no_grad():
+                            logits = ser_model(**inputs).logits
+                        scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                        unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                        for i, score in enumerate(scores):
+                            raw_emo = ser_model.config.id2label[i]
+                            unified_emo = SER_TO_UNIFIED.get(raw_emo)
+                            if unified_emo:
+                                unified_ser_scores[unified_emo] += score.item()
+                        audio_analysis_results['Speech Emotion'] = max(unified_ser_scores, key=unified_ser_scores.get).capitalize()
+                        os.unlink(temp_audio_path)
+                    else:
+                        audio_analysis_results['Transcription'] = "No audio track found in the video."
+
+                except Exception as e:
+                    st.error(f"An error occurred during audio analysis: {e}")
+                finally:
+                    if video_clip:
+                        video_clip.close()  # --- THIS IS PART of the FIX for Error 3 ---
+
+            # --- Display Results ---
+            st.header("Analysis Results")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.subheader("Audio Analysis")
+                st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
+                st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
+                st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
+            with col2:
+                st.subheader("Facial Expression Timeline")
+                if facial_analysis_results:
+                    for timestamp, emotion in facial_analysis_results:
+                        st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
+                else:
+                    st.write("No faces detected or video processing failed.")
+
+    finally:
+        # --- THIS IS THE FINAL PART of the FIX for Error 3 ---
+        # Clean up the temporary video file in a finally block to ensure it runs
+        if temp_video_path and os.path.exists(temp_video_path):
+            os.unlink(temp_video_path)
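
Two notes on the fixes above. The "Error 2" padding exists because the HuBERT-style convolutional feature encoder needs roughly 400 input samples to emit even a single frame; 400 samples is 25 ms at the app's 16 kHz `AUDIO_SAMPLE_RATE`. (The `np.zeros`/`np.concatenate` calls assume `import numpy as np` appears earlier in the file, outside the hunks shown.) A minimal sketch of that padding step in isolation, assuming a mono float waveform as returned by `soundfile`:

```python
import numpy as np

def pad_to_min_length(audio_array: np.ndarray, min_length: int = 400) -> np.ndarray:
    """Right-pad a waveform with zeros so it has at least `min_length` samples.

    `min_length=400` mirrors the value chosen in the diff; shorter clips can
    otherwise fail inside the SER model's convolutional front end.
    """
    if len(audio_array) < min_length:
        padding = np.zeros(min_length - len(audio_array), dtype=audio_array.dtype)
        audio_array = np.concatenate([audio_array, padding])
    return audio_array

# Example: a 100-sample clip becomes exactly 400 samples.
assert pad_to_min_length(np.zeros(100, dtype=np.float32)).shape == (400,)
```

The "Error 3" changes all follow the same pattern: resources acquired in a `try` block (`cap`, `video_clip`, and the temporary video file) are released in matching `finally` blocks, so cleanup runs even when analysis raises partway through.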