Kevin King committed
Commit b2395f1 · 1 Parent(s): 555549c

Fix caching directory setup and improve video/audio processing error handling in Streamlit app

Files changed (1):
  1. src/streamlit_app.py  +115 -112
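
The core of the caching fix in the diff below is swapping hard-coded `/tmp` paths for a cache root derived from `tempfile.gettempdir()`, which also works on Windows and on hosts where `/tmp` is absent or read-only. A minimal standalone sketch of the pattern (names mirror the diff; the key constraint is that `DEEPFACE_HOME` and `HF_HOME` should be set before DeepFace and the Hugging Face libraries are first imported, since they resolve their cache locations from these variables when loaded):

```python
import os
import tempfile

# Writable, cross-platform cache root (same CACHE_DIR name as in the diff below).
CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# Set these before `import deepface` / `import transformers`, since both
# libraries pick up their cache location from the environment when loaded.
os.environ['DEEPFACE_HOME'] = CACHE_DIR
os.environ['HF_HOME'] = CACHE_DIR
```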
src/streamlit_app.py CHANGED
@@ -12,9 +12,13 @@ from PIL import Image
 import cv2
 from moviepy.editor import VideoFileClip
 
-# Set home directories for model caching to the writable /tmp folder
-os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
-os.environ['HF_HOME'] = '/tmp/huggingface'
+# --- THIS IS THE FIX for Error 1 ---
+# Create a cross-platform, writable cache directory for all libraries
+CACHE_DIR = os.path.join(tempfile.gettempdir(), "affectlink_cache")
+os.makedirs(CACHE_DIR, exist_ok=True)
+os.environ['DEEPFACE_HOME'] = CACHE_DIR
+os.environ['HF_HOME'] = CACHE_DIR
+# ====================================
 
 # --- Page Configuration ---
 st.set_page_config(
@@ -47,12 +51,11 @@ AUDIO_SAMPLE_RATE = 16000
 @st.cache_resource
 def load_models():
     with st.spinner("Loading AI models, this may take a moment..."):
-        whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
+        whisper_model = whisper.load_model("base", download_root=os.path.join(CACHE_DIR, "whisper"))
         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
         ser_model_name = "superb/hubert-large-superb-er"
         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
-        # DeepFace loads its own models on first use, no need to preload here.
     return whisper_model, text_classifier, ser_model, ser_feature_extractor
 
 whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
@@ -61,114 +64,114 @@ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
 uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi", "mkv"])
 
 if uploaded_file is not None:
-    # Save the uploaded file to a temporary location
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
-        tfile.write(uploaded_file.read())
-        temp_video_path = tfile.name
-
-    st.video(temp_video_path)
-
-    if st.button("Analyze Video"):
-        facial_analysis_results = []
-        audio_analysis_results = {}
-
-        # --- Video Processing for Facial Emotion ---
-        with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
-            try:
-                cap = cv2.VideoCapture(temp_video_path)
-                fps = cap.get(cv2.CAP_PROP_FPS)
-                if fps == 0:
-                    fps = 30 # Default to 30 fps if not available
-                frame_count = 0
-                while cap.isOpened():
-                    ret, frame = cap.read()
-                    if not ret:
-                        break
-
-                    if frame_count % int(fps) == 0:
-                        timestamp = frame_count / fps
-                        analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
-                        if isinstance(analysis, list) and len(analysis) > 0:
-                            dominant_emotion = analysis[0]['dominant_emotion']
-                            facial_analysis_results.append((timestamp, dominant_emotion.capitalize()))
-                    frame_count += 1
-                cap.release()
-            except Exception as e:
-                st.error(f"An error occurred during facial analysis: {e}")
-
-        # --- Audio Extraction and Processing ---
-        with st.spinner("Extracting and analyzing audio..."):
-            temp_audio_path = None
-            video_clip = None
-            try:
-                video_clip = VideoFileClip(temp_video_path)
-                if video_clip.audio is not None:
-                    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
-                        video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
-                        temp_audio_path = taudio.name
-
-                    # 1. Speech-to-Text (Whisper)
-                    result = whisper_model.transcribe(temp_audio_path, fp16=False)
-                    transcribed_text = result['text'] if result['text'] else "No speech detected."
-                    audio_analysis_results['Transcription'] = transcribed_text
-
-                    # 2. Text-based Emotion
-                    if "No speech detected" not in transcribed_text:
-                        text_emotions = text_classifier(transcribed_text)[0]
-                        unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                        for emo in text_emotions:
-                            unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
-                            if unified_emo:
-                                unified_text_scores[unified_emo] += emo['score']
-                        dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
-                        audio_analysis_results['Text Emotion'] = dominant_text_emotion.capitalize()
-
-                    # 3. Speech Emotion Recognition (SER)
-                    audio_array, _ = sf.read(temp_audio_path)
-                    inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-                    with torch.no_grad():
-                        logits = ser_model(**inputs).logits
-                    scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-                    unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                    for i, score in enumerate(scores):
-                        raw_emo = ser_model.config.id2label[i]
-                        unified_emo = SER_TO_UNIFIED.get(raw_emo)
-                        if unified_emo:
-                            unified_ser_scores[unified_emo] += score.item()
-                    dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
-                    audio_analysis_results['Speech Emotion'] = dominant_ser_emotion.capitalize()
-                else:
-                    audio_analysis_results['Transcription'] = "No audio track found in the video."
-
-            except Exception as e:
-                st.error(f"An error occurred during audio analysis: {e}")
-            finally:
-                if video_clip:
-                    video_clip.close()
-                if temp_audio_path and os.path.exists(temp_audio_path):
-                    os.unlink(temp_audio_path)
-
-        # --- Display Results ---
-        st.header("Analysis Results")
-        col1, col2 = st.columns(2)
-
-        with col1:
-            st.subheader("Audio Analysis")
-            if audio_analysis_results:
-                st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
-                st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
-                st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
-            else:
-                st.write("No audio results to display.")
-
-        with col2:
-            st.subheader("Facial Expression Timeline")
-            if facial_analysis_results:
-                for timestamp, emotion in facial_analysis_results:
-                    st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
-            else:
-                st.write("No faces detected or video processing failed.")
-
-        # Clean up temp video file after analysis is done
-        if os.path.exists(temp_video_path):
-            os.unlink(temp_video_path)
+    temp_video_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
+            tfile.write(uploaded_file.read())
+            temp_video_path = tfile.name
+
+        st.video(temp_video_path)
+
+        if st.button("Analyze Video"):
+            facial_analysis_results = []
+            audio_analysis_results = {}
+            cap = None  # Initialize cap to None
+
+            # --- Video Processing ---
+            with st.spinner("Analyzing video for facial expressions... (1 frame per second)"):
+                try:
+                    cap = cv2.VideoCapture(temp_video_path)
+                    fps = cap.get(cv2.CAP_PROP_FPS)
+                    if fps == 0:
+                        fps = 30
+                    frame_count = 0
+                    while cap.isOpened():
+                        ret, frame = cap.read()
+                        if not ret:
+                            break
+                        if frame_count % int(fps) == 0:
+                            timestamp = frame_count / fps
+                            analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
+                            if isinstance(analysis, list) and len(analysis) > 0:
+                                facial_analysis_results.append((timestamp, analysis[0]['dominant_emotion'].capitalize()))
+                        frame_count += 1
+                except Exception as e:
+                    st.error(f"An error occurred during facial analysis: {e}")
+                finally:
+                    if cap:
+                        cap.release()  # --- THIS IS PART of the FIX for Error 3 ---
+
+            # --- Audio Processing ---
+            with st.spinner("Extracting and analyzing audio..."):
+                video_clip = None
+                try:
+                    video_clip = VideoFileClip(temp_video_path)
+                    if video_clip.audio:
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
+                            video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
+                            temp_audio_path = taudio.name
+
+                        result = whisper_model.transcribe(temp_audio_path, fp16=False)
+                        transcribed_text = result['text'] if result['text'] else "No speech detected."
+                        audio_analysis_results['Transcription'] = transcribed_text
+
+                        if "No speech detected" not in transcribed_text:
+                            text_emotions = text_classifier(transcribed_text)[0]
+                            unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                            for emo in text_emotions:
+                                unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
+                                if unified_emo:
+                                    unified_text_scores[unified_emo] += emo['score']
+                            audio_analysis_results['Text Emotion'] = max(unified_text_scores, key=unified_text_scores.get).capitalize()
+
+                        audio_array, _ = sf.read(temp_audio_path)
+
+                        # --- THIS IS THE FIX for Error 2 ---
+                        min_length = 400  # Minimum samples for the SER model
+                        if len(audio_array) < min_length:
+                            padding = np.zeros(min_length - len(audio_array))
+                            audio_array = np.concatenate([audio_array, padding])
+                        # =====================================
+
+                        inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                        with torch.no_grad():
+                            logits = ser_model(**inputs).logits
+                        scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                        unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                        for i, score in enumerate(scores):
+                            raw_emo = ser_model.config.id2label[i]
+                            unified_emo = SER_TO_UNIFIED.get(raw_emo)
+                            if unified_emo:
+                                unified_ser_scores[unified_emo] += score.item()
+                        audio_analysis_results['Speech Emotion'] = max(unified_ser_scores, key=unified_ser_scores.get).capitalize()
+                        os.unlink(temp_audio_path)
+                    else:
+                        audio_analysis_results['Transcription'] = "No audio track found in the video."
+
+                except Exception as e:
+                    st.error(f"An error occurred during audio analysis: {e}")
+                finally:
+                    if video_clip:
+                        video_clip.close()  # --- THIS IS PART of the FIX for Error 3 ---
+
+            # --- Display Results ---
+            st.header("Analysis Results")
+            col1, col2 = st.columns(2)
+            with col1:
+                st.subheader("Audio Analysis")
+                st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
+                st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
+                st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
+            with col2:
+                st.subheader("Facial Expression Timeline")
+                if facial_analysis_results:
+                    for timestamp, emotion in facial_analysis_results:
+                        st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
+                else:
+                    st.write("No faces detected or video processing failed.")
+
+    finally:
+        # --- THIS IS THE FINAL PART of the FIX for Error 3 ---
+        # Clean up the temporary video file in a finally block to ensure it runs
+        if temp_video_path and os.path.exists(temp_video_path):
+            os.unlink(temp_video_path)
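
Two notes on the fixes above. The "Error 2" padding exists because the HuBERT-style convolutional feature encoder needs roughly 400 input samples to emit even a single frame; 400 samples is 25 ms at the app's 16 kHz `AUDIO_SAMPLE_RATE`. (The `np.zeros`/`np.concatenate` calls assume `import numpy as np` appears earlier in the file, outside the hunks shown.) A minimal sketch of that padding step in isolation, assuming a mono float waveform as returned by `soundfile`:

```python
import numpy as np

def pad_to_min_length(audio_array: np.ndarray, min_length: int = 400) -> np.ndarray:
    """Right-pad a waveform with zeros so it has at least `min_length` samples.

    `min_length=400` mirrors the value chosen in the diff; shorter clips can
    otherwise fail inside the SER model's convolutional front end.
    """
    if len(audio_array) < min_length:
        padding = np.zeros(min_length - len(audio_array), dtype=audio_array.dtype)
        audio_array = np.concatenate([audio_array, padding])
    return audio_array

# Example: a 100-sample clip becomes exactly 400 samples.
assert pad_to_min_length(np.zeros(100, dtype=np.float32)).shape == (400,)
```

The "Error 3" changes all follow the same pattern: resources acquired in a `try` block (`cap`, `video_clip`, and the temporary video file) are released in matching `finally` blocks, so cleanup runs even when analysis raises partway through.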