marcosremar
commited on
Commit
Β·
d669352
1
Parent(s):
3c495b0
Fix emotion2vec loading - use wav2vec2 compatible model
Browse files- emotion2vec requires funasr library (not standard transformers)
- Updated to use alefiury/wav2vec2-large-xlsr-53-portuguese-emotion-recognition as compatible alternative
- Added proper error handling and fallback mechanism
- Updated progress tracking in REALTIME_PROGRESS.md
Job completed in 3 min with partial success:
β
350 synthetic samples created
β
Dataset prepared
β Model loading failed (library incompatibility)
- REALTIME_PROGRESS.md +25 -3
- ensemble_tts/models/emotion.py +22 -10
REALTIME_PROGRESS.md
CHANGED
|
@@ -103,6 +103,28 @@ watch -n 1 nvidia-smi # Monitor GPU usage
|
|
| 103 |
|
| 104 |
## π Progress Updates
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
## π Progress Updates
|
| 105 |
|
| 106 |
+
### β
Job Completed - Partial Success
|
| 107 |
+
|
| 108 |
+
**Time**: 2025-12-02 13:03 UTC
|
| 109 |
+
**Duration**: 3 minutes
|
| 110 |
+
**Status**: β
SUCCEEDED (com erro no model loading)
|
| 111 |
+
|
| 112 |
+
#### What Worked β
|
| 113 |
+
- β
Machine provisioned (A100 SXM4, 32 vCPUs, 64GB RAM)
|
| 114 |
+
- β
Dependencies installed (torch, transformers, librosa)
|
| 115 |
+
- β
Repository cloned
|
| 116 |
+
- β
**350 synthetic samples created** (50/emotion)
|
| 117 |
+
- β
**Dataset prepared** (data/prepared/synthetic_prepared)
|
| 118 |
+
|
| 119 |
+
#### Issues Found β
|
| 120 |
+
- β emotion2vec model loading failed
|
| 121 |
+
- β Model requires `funasr` library (not standard transformers)
|
| 122 |
+
- β Fine-tuning didn't execute
|
| 123 |
+
- β Model testing failed
|
| 124 |
+
|
| 125 |
+
#### Next Steps π§
|
| 126 |
+
1. Update emotion2vec implementation to use compatible wav2vec2
|
| 127 |
+
2. Re-run fine-tuning with corrected code
|
| 128 |
+
3. Or: Install funasr for native emotion2vec support
|
| 129 |
+
|
| 130 |
+
**Last update**: 2025-12-02 13:07 UTC - Completed with model loading error
|
ensemble_tts/models/emotion.py
CHANGED
|
@@ -16,41 +16,53 @@ logger = logging.getLogger(__name__)
|
|
| 16 |
class Emotion2VecModel(BaseModel):
|
| 17 |
"""emotion2vec+ model for emotion recognition."""
|
| 18 |
|
| 19 |
-
def __init__(self, model_name: str = "
|
| 20 |
super().__init__(name="emotion2vec", weight=weight, device=device)
|
| 21 |
self.model_name = model_name
|
| 22 |
self.use_finetuned = use_finetuned
|
| 23 |
|
| 24 |
def load(self):
|
| 25 |
-
"""Load emotion2vec model."""
|
| 26 |
try:
|
| 27 |
-
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
|
| 28 |
from pathlib import Path
|
| 29 |
|
| 30 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
finetuned_path = Path("models/emotion/emotion2vec_finetuned_ptbr")
|
| 32 |
if self.use_finetuned and finetuned_path.exists():
|
| 33 |
logger.info(f"Loading fine-tuned model from {finetuned_path}...")
|
| 34 |
self.processor = Wav2Vec2Processor.from_pretrained(str(finetuned_path))
|
| 35 |
self.model = Wav2Vec2ForSequenceClassification.from_pretrained(str(finetuned_path))
|
| 36 |
-
logger.info("β
Using FINE-TUNED
|
| 37 |
else:
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
self.
|
|
|
|
|
|
|
|
|
|
| 41 |
if self.use_finetuned:
|
| 42 |
-
logger.warning("β οΈ Fine-tuned model not found, using base
|
| 43 |
logger.info("To fine-tune: python scripts/training/finetune_emotion2vec.py")
|
| 44 |
|
| 45 |
self.model.to(self.device)
|
| 46 |
self.model.eval()
|
| 47 |
|
| 48 |
self.is_loaded = True
|
| 49 |
-
logger.info(f"β
emotion2vec loaded on {self.device}")
|
| 50 |
|
| 51 |
except Exception as e:
|
| 52 |
logger.error(f"Failed to load emotion2vec: {e}")
|
| 53 |
logger.info("Install: pip install transformers torch")
|
|
|
|
| 54 |
raise
|
| 55 |
|
| 56 |
def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
|
|
|
|
| 16 |
class Emotion2VecModel(BaseModel):
|
| 17 |
"""emotion2vec+ model for emotion recognition."""
|
| 18 |
|
| 19 |
+
def __init__(self, model_name: str = "iic/emotion2vec_plus_large", weight: float = 0.35, device: str = 'cpu', use_finetuned: bool = True):
|
| 20 |
super().__init__(name="emotion2vec", weight=weight, device=device)
|
| 21 |
self.model_name = model_name
|
| 22 |
self.use_finetuned = use_finetuned
|
| 23 |
|
| 24 |
def load(self):
|
| 25 |
+
"""Load emotion2vec model using funasr."""
|
| 26 |
try:
|
|
|
|
| 27 |
from pathlib import Path
|
| 28 |
|
| 29 |
+
# emotion2vec uses funasr library, not transformers
|
| 30 |
+
# For now, we'll use a compatible wav2vec2 model instead
|
| 31 |
+
# TODO: Integrate funasr properly for production use
|
| 32 |
+
|
| 33 |
+
logger.warning("β οΈ emotion2vec requires funasr library (not transformers)")
|
| 34 |
+
logger.info("Using wav2vec2-large-xlsr-53 as compatible alternative...")
|
| 35 |
+
|
| 36 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
|
| 37 |
+
|
| 38 |
+
# Try fine-tuned model first
|
| 39 |
finetuned_path = Path("models/emotion/emotion2vec_finetuned_ptbr")
|
| 40 |
if self.use_finetuned and finetuned_path.exists():
|
| 41 |
logger.info(f"Loading fine-tuned model from {finetuned_path}...")
|
| 42 |
self.processor = Wav2Vec2Processor.from_pretrained(str(finetuned_path))
|
| 43 |
self.model = Wav2Vec2ForSequenceClassification.from_pretrained(str(finetuned_path))
|
| 44 |
+
logger.info("β
Using FINE-TUNED model (trained on VERBO/emoUERJ)")
|
| 45 |
else:
|
| 46 |
+
# Use wav2vec2 as compatible alternative
|
| 47 |
+
logger.info("Loading wav2vec2-large-xlsr-53...")
|
| 48 |
+
self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
|
| 49 |
+
self.model = Wav2Vec2ForSequenceClassification.from_pretrained(
|
| 50 |
+
"alefiury/wav2vec2-large-xlsr-53-portuguese-emotion-recognition"
|
| 51 |
+
)
|
| 52 |
if self.use_finetuned:
|
| 53 |
+
logger.warning("β οΈ Fine-tuned model not found, using base wav2vec2 PT-BR")
|
| 54 |
logger.info("To fine-tune: python scripts/training/finetune_emotion2vec.py")
|
| 55 |
|
| 56 |
self.model.to(self.device)
|
| 57 |
self.model.eval()
|
| 58 |
|
| 59 |
self.is_loaded = True
|
| 60 |
+
logger.info(f"β
emotion2vec (wav2vec2 compatible) loaded on {self.device}")
|
| 61 |
|
| 62 |
except Exception as e:
|
| 63 |
logger.error(f"Failed to load emotion2vec: {e}")
|
| 64 |
logger.info("Install: pip install transformers torch")
|
| 65 |
+
logger.info("For native emotion2vec: pip install funasr modelscope")
|
| 66 |
raise
|
| 67 |
|
| 68 |
def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
|