marcosremar
committed on
Commit
·
a0bb20d
1
Parent(s):
7391cea
Simplify ensemble: prioritize working models
Browse files
Changes:
- Quick mode: Whisper + SenseVoice (both reliable)
- Balanced mode: Whisper + SenseVoice + Wav2Vec2 PT-BR
- Full mode: 5 models with emotion2vec as optional fallback
- Removed emotion2vec from critical path due to funasr dependency
Rationale:
- Whisper and SenseVoice have proven reliability
- emotion2vec requires non-standard funasr library
- PT-BR wav2vec2 model has fallback to XLSR
- Maintains 3-model ensemble for OPTION A
ensemble_tts/models/emotion.py
CHANGED
|
@@ -455,26 +455,29 @@ class EmotionEnsemble(BaseEnsemble):
|
|
| 455 |
def _get_models_for_mode(self, mode: str, device: str):
|
| 456 |
"""Get models based on mode."""
|
| 457 |
if mode == 'quick':
|
|
|
|
|
|
|
| 458 |
return [
|
| 459 |
-
|
| 460 |
SenseVoiceModel(weight=0.4, device=device)
|
| 461 |
]
|
| 462 |
elif mode == 'balanced':
|
| 463 |
# OPTION A: 3 diverse models - optimal per academic research
|
| 464 |
-
#
|
| 465 |
# Expected: 95-97% accuracy at 3x computational cost
|
| 466 |
return [
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
]
|
| 471 |
elif mode == 'full':
|
|
|
|
| 472 |
return [
|
| 473 |
-
|
| 474 |
SenseVoiceModel(weight=0.25, device=device),
|
| 475 |
-
|
| 476 |
-
HuBERTEmotionModel(weight=0.
|
| 477 |
-
|
| 478 |
]
|
| 479 |
else:
|
| 480 |
logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.")
|
|
|
|
| 455 |
def _get_models_for_mode(self, mode: str, device: str):
|
| 456 |
"""Get models based on mode."""
|
| 457 |
if mode == 'quick':
|
| 458 |
+
# Quick mode: 2 models for speed
|
| 459 |
+
# Whisper + SenseVoice (both work reliably)
|
| 460 |
return [
|
| 461 |
+
WhisperEmotionModel(weight=0.6, device=device),
|
| 462 |
SenseVoiceModel(weight=0.4, device=device)
|
| 463 |
]
|
| 464 |
elif mode == 'balanced':
|
| 465 |
# OPTION A: 3 diverse models - optimal per academic research
|
| 466 |
+
# Whisper + SenseVoice + Wav2Vec2 PT-BR
|
| 467 |
# Expected: 95-97% accuracy at 3x computational cost
|
| 468 |
return [
|
| 469 |
+
WhisperEmotionModel(weight=0.40, device=device), # Encoder embeddings
|
| 470 |
+
SenseVoiceModel(weight=0.35, device=device), # Multi-task capability
|
| 471 |
+
Wav2Vec2PTBRModel(weight=0.25, device=device) # PT-BR specific
|
| 472 |
]
|
| 473 |
elif mode == 'full':
|
| 474 |
+
# Full mode: 5 models for maximum accuracy
|
| 475 |
return [
|
| 476 |
+
WhisperEmotionModel(weight=0.30, device=device),
|
| 477 |
SenseVoiceModel(weight=0.25, device=device),
|
| 478 |
+
Wav2Vec2PTBRModel(weight=0.20, device=device),
|
| 479 |
+
HuBERTEmotionModel(weight=0.15, device=device),
|
| 480 |
+
Emotion2VecModel(weight=0.10, device=device) # Optional, may fail
|
| 481 |
]
|
| 482 |
else:
|
| 483 |
logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.")
|