"""
Download and prepare Brazilian Portuguese emotion datasets (VERBO, emoUERJ, CORAA-SER).
The corpora require manual download; this script prints per-dataset instructions and
formats locally extracted data for fine-tuning.
"""
import logging
import argparse
from pathlib import Path
from datasets import Dataset, Audio
import pandas as pd
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def download_verbo():
"""
    Print download instructions for the VERBO dataset.
VERBO: Brazilian Portuguese emotional speech corpus
- 1,167 samples
- 7 emotions: neutral, happy, sad, angry, fearful, disgusted, surprised
- Paper: "VERBO: A Corpus for Emotion Recognition in Brazilian Portuguese"
- Source: http://www02.smt.ufrj.br/~verbo/
Note: VERBO may require manual download or authorization.
"""
logger.info("\nπŸ“₯ VERBO Dataset")
logger.info("=" * 60)
logger.info("Dataset: VERBO - Brazilian Portuguese Emotional Speech")
logger.info("Samples: 1,167")
logger.info("Emotions: 7 (neutral, happy, sad, angry, fearful, disgusted, surprised)")
logger.info("\n⚠️ Manual download required:")
logger.info("1. Visit: http://www02.smt.ufrj.br/~verbo/")
logger.info("2. Request access to the dataset")
logger.info("3. Download and extract to: data/raw/verbo/")
logger.info("\nExpected structure:")
logger.info(" data/raw/verbo/")
logger.info(" β”œβ”€β”€ neutral/")
logger.info(" β”œβ”€β”€ happy/")
logger.info(" β”œβ”€β”€ sad/")
logger.info(" β”œβ”€β”€ angry/")
logger.info(" β”œβ”€β”€ fearful/")
logger.info(" β”œβ”€β”€ disgusted/")
logger.info(" └── surprised/")
def download_emouerj():
    """
    Print download instructions for the emoUERJ dataset.
emoUERJ: Brazilian Portuguese emotional speech dataset
- 377 samples
- 4 emotions: neutral, happy, sad, angry
- Paper: "emoUERJ: A Deep Learning-Based Emotion Classifier for Brazilian Portuguese"
- Source: UERJ (State University of Rio de Janeiro)
Note: emoUERJ may require manual download or authorization.
"""
logger.info("\nπŸ“₯ emoUERJ Dataset")
logger.info("=" * 60)
logger.info("Dataset: emoUERJ - Brazilian Portuguese Emotional Speech")
logger.info("Samples: 377")
logger.info("Emotions: 4 (neutral, happy, sad, angry)")
logger.info("\n⚠️ Manual download required:")
logger.info("1. Contact UERJ researchers or check university repository")
logger.info("2. Download and extract to: data/raw/emouej/")
logger.info("\nExpected structure:")
logger.info(" data/raw/emouej/")
logger.info(" β”œβ”€β”€ neutral/")
logger.info(" β”œβ”€β”€ happy/")
logger.info(" β”œβ”€β”€ sad/")
logger.info(" └── angry/")
def download_coraa_ser():
"""
    Print download instructions for the CORAA-SER dataset.
CORAA-SER: Brazilian Portuguese Speech Emotion Recognition subset
- Part of CORAA corpus (290 hours total)
- Prosodic annotations available
- Multiple speakers, spontaneous speech
- Source: https://github.com/nilc-nlp/CORAA
Note: CORAA is large. Download only the emotion/prosody subset if possible.
"""
logger.info("\nπŸ“₯ CORAA-SER Dataset")
logger.info("=" * 60)
logger.info("Dataset: CORAA - Brazilian Portuguese Speech Corpus")
logger.info("Total: 290 hours")
logger.info("Annotations: Prosodic features (intonation, stress, rhythm)")
logger.info("\n⚠️ Large dataset - manual download recommended:")
logger.info("1. Visit: https://github.com/nilc-nlp/CORAA")
logger.info("2. Download emotion/prosody subset if available")
logger.info("3. Extract to: data/raw/coraa/")
logger.info("\nπŸ’‘ Note: CORAA has prosodic annotations but limited emotion labels")
logger.info(" Use primarily for prosody modeling, not emotion classification")
def prepare_local_dataset(data_dir: Path, emotion_folders: list):
"""
Prepare a local dataset from folder structure.
Expected structure:
data_dir/
β”œβ”€β”€ emotion1/
β”‚ β”œβ”€β”€ audio1.wav
β”‚ └── audio2.wav
β”œβ”€β”€ emotion2/
β”‚ └── audio3.wav
"""
logger.info(f"\nπŸ”„ Preparing dataset from: {data_dir}")
if not data_dir.exists():
logger.error(f"❌ Directory not found: {data_dir}")
return None
samples = []
for emotion in emotion_folders:
emotion_dir = data_dir / emotion
if not emotion_dir.exists():
logger.warning(f"⚠️ Emotion folder not found: {emotion_dir}")
continue
# Find all audio files
audio_files = list(emotion_dir.glob("*.wav")) + \
list(emotion_dir.glob("*.mp3")) + \
list(emotion_dir.glob("*.flac"))
logger.info(f" {emotion}: {len(audio_files)} files")
for audio_file in audio_files:
samples.append({
"audio": str(audio_file),
"emotion": emotion,
"file_name": audio_file.name
})
if not samples:
logger.error("❌ No audio files found")
return None
# Create HuggingFace dataset
df = pd.DataFrame(samples)
dataset = Dataset.from_pandas(df)
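    # Casting to Audio(sampling_rate=16000) makes `datasets` decode and resample each
    # file lazily on access; 16 kHz is what speech encoders such as emotion2vec expect.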
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
logger.info(f"βœ… Dataset created: {len(dataset)} samples")
return dataset
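

# Hedged sketch (not part of the original script): one way a prepared dataset could be
# split for fine-tuning, with string labels encoded and a stratified hold-out set.
# The function name and the default 90/10 split are assumptions for this example.
def split_for_finetuning(dataset: Dataset, test_size: float = 0.1):
    # class_encode_column turns the string column into a ClassLabel,
    # which train_test_split requires for stratification
    dataset = dataset.class_encode_column("emotion")
    return dataset.train_test_split(
        test_size=test_size, seed=42, stratify_by_column="emotion"
    )
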
def main():
parser = argparse.ArgumentParser(description="Download PT-BR emotion datasets")
parser.add_argument("--dataset", type=str, choices=["verbo", "emouej", "coraa", "all"],
default="all", help="Dataset to download")
parser.add_argument("--prepare-local", type=str, default=None,
help="Prepare dataset from local directory")
parser.add_argument("--emotions", type=str, nargs="+",
default=["neutral", "happy", "sad", "angry", "fearful", "disgusted", "surprised"],
help="Emotion folders to look for")
parser.add_argument("--output", type=str, default="data/prepared/",
help="Output directory for prepared datasets")
args = parser.parse_args()
logger.info("\n" + "=" * 60)
logger.info("Portuguese BR Emotion Dataset Downloader")
logger.info("=" * 60)
# If preparing local dataset
if args.prepare_local:
data_dir = Path(args.prepare_local)
dataset = prepare_local_dataset(data_dir, args.emotions)
if dataset is not None:
output_path = Path(args.output) / f"{data_dir.name}_prepared"
output_path.mkdir(parents=True, exist_ok=True)
dataset.save_to_disk(str(output_path))
logger.info(f"βœ… Saved to: {output_path}")
return
# Show download instructions
if args.dataset in ["verbo", "all"]:
download_verbo()
if args.dataset in ["emouej", "all"]:
download_emouej()
if args.dataset in ["coraa", "all"]:
download_coraa_ser()
logger.info("\n" + "=" * 60)
logger.info("πŸ“ Next Steps:")
logger.info("=" * 60)
logger.info("1. Download datasets manually from sources above")
logger.info("2. Extract to data/raw/ directory")
logger.info("3. Run this script with --prepare-local to format for training:")
logger.info("\n python scripts/data/download_ptbr_datasets.py \\")
logger.info(" --prepare-local data/raw/verbo/ \\")
logger.info(" --emotions neutral happy sad angry fearful disgusted surprised")
logger.info("\n4. Use prepared dataset for fine-tuning:")
logger.info("\n python scripts/training/finetune_emotion2vec.py")
if __name__ == "__main__":
main()