marcosremar committed on
Commit
a0bb20d
·
1 Parent(s): 7391cea

Simplify ensemble: prioritize working models

Browse files

Changes:
- Quick mode: Whisper + SenseVoice (both reliable)
- Balanced mode: Whisper + SenseVoice + Wav2Vec2 PT-BR
- Full mode: 5 models with emotion2vec as optional fallback
- Removed emotion2vec from critical path due to funasr dependency

Rationale:
- Whisper and SenseVoice have proven reliability
- emotion2vec requires non-standard funasr library
- PT-BR wav2vec2 model has fallback to XLSR
- Maintains 3-model ensemble for OPTION A

Files changed (1) hide show
  1. ensemble_tts/models/emotion.py +12 -9
ensemble_tts/models/emotion.py CHANGED
@@ -455,26 +455,29 @@ class EmotionEnsemble(BaseEnsemble):
455
  def _get_models_for_mode(self, mode: str, device: str):
456
  """Get models based on mode."""
457
  if mode == 'quick':
 
 
458
  return [
459
- Emotion2VecModel(weight=0.6, device=device),
460
  SenseVoiceModel(weight=0.4, device=device)
461
  ]
462
  elif mode == 'balanced':
463
  # OPTION A: 3 diverse models - optimal per academic research
464
- # emotion2vec (fine-tuned) + Whisper + SenseVoice
465
  # Expected: 95-97% accuracy at 3x computational cost
466
  return [
467
- Emotion2VecModel(weight=0.50, device=device), # Fine-tuned on VERBO/emoUERJ
468
- WhisperEmotionModel(weight=0.30, device=device), # Different architecture
469
- SenseVoiceModel(weight=0.20, device=device) # Multi-task capability
470
  ]
471
  elif mode == 'full':
 
472
  return [
473
- Emotion2VecModel(weight=0.35, device=device),
474
  SenseVoiceModel(weight=0.25, device=device),
475
- WhisperEmotionModel(weight=0.20, device=device),
476
- HuBERTEmotionModel(weight=0.12, device=device),
477
- Wav2Vec2PTBRModel(weight=0.08, device=device)
478
  ]
479
  else:
480
  logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.")
 
455
  def _get_models_for_mode(self, mode: str, device: str):
456
  """Get models based on mode."""
457
  if mode == 'quick':
458
+ # Quick mode: 2 models for speed
459
+ # Whisper + SenseVoice (both work reliably)
460
  return [
461
+ WhisperEmotionModel(weight=0.6, device=device),
462
  SenseVoiceModel(weight=0.4, device=device)
463
  ]
464
  elif mode == 'balanced':
465
  # OPTION A: 3 diverse models - optimal per academic research
466
+ # Whisper + SenseVoice + Wav2Vec2 PT-BR
467
  # Expected: 95-97% accuracy at 3x computational cost
468
  return [
469
+ WhisperEmotionModel(weight=0.40, device=device), # Encoder embeddings
470
+ SenseVoiceModel(weight=0.35, device=device), # Multi-task capability
471
+ Wav2Vec2PTBRModel(weight=0.25, device=device) # PT-BR specific
472
  ]
473
  elif mode == 'full':
474
+ # Full mode: 5 models for maximum accuracy
475
  return [
476
+ WhisperEmotionModel(weight=0.30, device=device),
477
  SenseVoiceModel(weight=0.25, device=device),
478
+ Wav2Vec2PTBRModel(weight=0.20, device=device),
479
+ HuBERTEmotionModel(weight=0.15, device=device),
480
+ Emotion2VecModel(weight=0.10, device=device) # Optional, may fail
481
  ]
482
  else:
483
  logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.")