marcosremar committed on
Commit
a0bb20d
·
1 Parent(s): 7391cea

Simplify ensemble: prioritize working models

Browse files

Changes:
- Quick mode: Whisper + SenseVoice (both reliable)
- Balanced mode: Whisper + SenseVoice + Wav2Vec2 PT-BR
- Full mode: 5 models with emotion2vec as optional fallback
- Removed emotion2vec from critical path due to funasr dependency

Rationale:
- Whisper and SenseVoice have proven reliability
- emotion2vec requires non-standard funasr library
- PT-BR wav2vec2 model has fallback to XLSR
- Maintains 3-model ensemble for OPTION A

Files changed (1) hide show
  1. ensemble_tts/models/emotion.py +12 -9
ensemble_tts/models/emotion.py CHANGED
@@ -455,26 +455,29 @@ class EmotionEnsemble(BaseEnsemble):
455
  def _get_models_for_mode(self, mode: str, device: str):
456
  """Get models based on mode."""
457
  if mode == 'quick':
 
 
458
  return [
459
- Emotion2VecModel(weight=0.6, device=device),
460
  SenseVoiceModel(weight=0.4, device=device)
461
  ]
462
  elif mode == 'balanced':
463
  # OPTION A: 3 diverse models - optimal per academic research
464
- # emotion2vec (fine-tuned) + Whisper + SenseVoice
465
  # Expected: 95-97% accuracy at 3x computational cost
466
  return [
467
- Emotion2VecModel(weight=0.50, device=device), # Fine-tuned on VERBO/emoUERJ
468
- WhisperEmotionModel(weight=0.30, device=device), # Different architecture
469
- SenseVoiceModel(weight=0.20, device=device) # Multi-task capability
470
  ]
471
  elif mode == 'full':
 
472
  return [
473
- Emotion2VecModel(weight=0.35, device=device),
474
  SenseVoiceModel(weight=0.25, device=device),
475
- WhisperEmotionModel(weight=0.20, device=device),
476
- HuBERTEmotionModel(weight=0.12, device=device),
477
- Wav2Vec2PTBRModel(weight=0.08, device=device)
478
  ]
479
  else:
480
  logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.")
 
455
  def _get_models_for_mode(self, mode: str, device: str):
456
  """Get models based on mode."""
457
  if mode == 'quick':
458
+ # Quick mode: 2 models for speed
459
+ # Whisper + SenseVoice (both work reliably)
460
  return [
461
+ WhisperEmotionModel(weight=0.6, device=device),
462
  SenseVoiceModel(weight=0.4, device=device)
463
  ]
464
  elif mode == 'balanced':
465
  # OPTION A: 3 diverse models - optimal per academic research
466
+ # Whisper + SenseVoice + Wav2Vec2 PT-BR
467
  # Expected: 95-97% accuracy at 3x computational cost
468
  return [
469
+ WhisperEmotionModel(weight=0.40, device=device), # Encoder embeddings
470
+ SenseVoiceModel(weight=0.35, device=device), # Multi-task capability
471
+ Wav2Vec2PTBRModel(weight=0.25, device=device) # PT-BR specific
472
  ]
473
  elif mode == 'full':
474
+ # Full mode: 5 models for maximum accuracy
475
  return [
476
+ WhisperEmotionModel(weight=0.30, device=device),
477
  SenseVoiceModel(weight=0.25, device=device),
478
+ Wav2Vec2PTBRModel(weight=0.20, device=device),
479
+ HuBERTEmotionModel(weight=0.15, device=device),
480
+ Emotion2VecModel(weight=0.10, device=device) # Optional, may fail
481
  ]
482
  else:
483
  logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.")