|
|
""" |
|
|
Download and prepare Portuguese BR emotion datasets (VERBO + emoUERJ). |
|
|
|
|
|
This script helps download and format the datasets for fine-tuning. |
|
|
""" |
|
|
|
|
|
import logging |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
import json |
|
|
import requests |
|
|
from datasets import Dataset, Audio |
|
|
import pandas as pd |
|
|
|
|
|
# Configure root logging once at import time so every logger emits INFO and above.
logging.basicConfig(level=logging.INFO)

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
def download_verbo():
    """
    Print download instructions for the VERBO dataset.

    VERBO: Brazilian Portuguese emotional speech corpus
    - 1,167 samples
    - 7 emotions: neutral, happy, sad, angry, fearful, disgusted, surprised
    - Paper: "VERBO: A Corpus for Emotion Recognition in Brazilian Portuguese"
    - Source: http://www02.smt.ufrj.br/~verbo/

    Note: VERBO requires manual download/authorization, so this function only
    logs the steps and the expected on-disk layout; it does not fetch anything.

    Returns:
        None.
    """
    # NOTE: the original strings were mojibake (UTF-8 emoji decoded as Greek
    # codepage); restored to the intended characters here.
    logger.info("\n📥 VERBO Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: VERBO - Brazilian Portuguese Emotional Speech")
    logger.info("Samples: 1,167")
    logger.info("Emotions: 7 (neutral, happy, sad, angry, fearful, disgusted, surprised)")
    logger.info("\n⚠️ Manual download required:")
    logger.info("1. Visit: http://www02.smt.ufrj.br/~verbo/")
    logger.info("2. Request access to the dataset")
    logger.info("3. Download and extract to: data/raw/verbo/")
    logger.info("\nExpected structure:")
    logger.info("  data/raw/verbo/")
    logger.info("  ├── neutral/")
    logger.info("  ├── happy/")
    logger.info("  ├── sad/")
    logger.info("  ├── angry/")
    logger.info("  ├── fearful/")
    logger.info("  ├── disgusted/")
    logger.info("  └── surprised/")
|
|
|
|
|
|
|
|
def download_emouej():
    """
    Print download instructions for the emoUERJ dataset.

    emoUERJ: Brazilian Portuguese emotional speech dataset
    - 377 samples
    - 4 emotions: neutral, happy, sad, angry
    - Paper: "emoUERJ: A Deep Learning-Based Emotion Classifier for Brazilian Portuguese"
    - Source: UERJ (State University of Rio de Janeiro)

    Note: emoUERJ requires manual download/authorization, so this function only
    logs the steps and the expected on-disk layout; it does not fetch anything.

    Returns:
        None.
    """
    # NOTE: the original strings were mojibake (UTF-8 emoji decoded as Greek
    # codepage); restored to the intended characters here.
    logger.info("\n📥 emoUERJ Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: emoUERJ - Brazilian Portuguese Emotional Speech")
    logger.info("Samples: 377")
    logger.info("Emotions: 4 (neutral, happy, sad, angry)")
    logger.info("\n⚠️ Manual download required:")
    logger.info("1. Contact UERJ researchers or check university repository")
    logger.info("2. Download and extract to: data/raw/emouej/")
    logger.info("\nExpected structure:")
    logger.info("  data/raw/emouej/")
    logger.info("  ├── neutral/")
    logger.info("  ├── happy/")
    logger.info("  ├── sad/")
    logger.info("  └── angry/")
|
|
|
|
|
|
|
|
def download_coraa_ser():
    """
    Print download instructions for the CORAA-SER dataset.

    CORAA-SER: Brazilian Portuguese Speech Emotion Recognition subset
    - Part of CORAA corpus (290 hours total)
    - Prosodic annotations available
    - Multiple speakers, spontaneous speech
    - Source: https://github.com/nilc-nlp/CORAA

    Note: CORAA is large. Download only the emotion/prosody subset if possible.
    This function only logs the steps; it does not fetch anything.

    Returns:
        None.
    """
    # NOTE: the original strings were mojibake (UTF-8 emoji decoded as Greek
    # codepage); restored to the intended characters here.
    logger.info("\n📥 CORAA-SER Dataset")
    logger.info("=" * 60)
    logger.info("Dataset: CORAA - Brazilian Portuguese Speech Corpus")
    logger.info("Total: 290 hours")
    logger.info("Annotations: Prosodic features (intonation, stress, rhythm)")
    logger.info("\n⚠️ Large dataset - manual download recommended:")
    logger.info("1. Visit: https://github.com/nilc-nlp/CORAA")
    logger.info("2. Download emotion/prosody subset if available")
    logger.info("3. Extract to: data/raw/coraa/")
    logger.info("\n💡 Note: CORAA has prosodic annotations but limited emotion labels")
    logger.info("   Use primarily for prosody modeling, not emotion classification")
|
|
|
|
|
|
|
|
def prepare_local_dataset(data_dir: Path, emotion_folders: list):
    """
    Prepare a HuggingFace audio dataset from a local folder structure.

    Expected structure::

        data_dir/
        ├── emotion1/
        │   ├── audio1.wav
        │   └── audio2.wav
        └── emotion2/
            └── audio3.wav

    Args:
        data_dir: Root directory containing one sub-folder per emotion label.
        emotion_folders: Emotion sub-folder names to scan for audio files.

    Returns:
        A ``datasets.Dataset`` with columns ``audio`` (cast to 16 kHz),
        ``emotion`` and ``file_name``, or ``None`` if the directory is
        missing or no audio files were found.
    """
    logger.info(f"\n📂 Preparing dataset from: {data_dir}")

    if not data_dir.exists():
        logger.error(f"❌ Directory not found: {data_dir}")
        return None

    samples = []
    for emotion in emotion_folders:
        emotion_dir = data_dir / emotion
        if not emotion_dir.exists():
            logger.warning(f"⚠️ Emotion folder not found: {emotion_dir}")
            continue

        # Gather the common audio formats; each file inherits the folder's label.
        audio_files = [
            audio_file
            for pattern in ("*.wav", "*.mp3", "*.flac")
            for audio_file in emotion_dir.glob(pattern)
        ]

        logger.info(f"  {emotion}: {len(audio_files)} files")

        for audio_file in audio_files:
            samples.append({
                "audio": str(audio_file),
                "emotion": emotion,
                "file_name": audio_file.name,
            })

    if not samples:
        logger.error("❌ No audio files found")
        return None

    df = pd.DataFrame(samples)
    dataset = Dataset.from_pandas(df)
    # Cast the path column to Audio so samples decode lazily at a fixed 16 kHz,
    # the sampling rate expected by typical speech-emotion models.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

    # BUGFIX: the original log string contained a literal newline inside a
    # single-quoted f-string, which is a SyntaxError.
    logger.info(f"✅ Dataset created: {len(dataset)} samples")
    return dataset
|
|
|
|
|
|
|
|
def main():
    """
    CLI entry point.

    Two modes:
    - default: print download instructions for the selected dataset(s);
    - ``--prepare-local DIR``: build a HuggingFace dataset from an
      already-downloaded folder tree and save it under ``--output``.
    """
    parser = argparse.ArgumentParser(description="Download PT-BR emotion datasets")
    parser.add_argument("--dataset", type=str, choices=["verbo", "emouej", "coraa", "all"],
                        default="all", help="Dataset to download")
    parser.add_argument("--prepare-local", type=str, default=None,
                        help="Prepare dataset from local directory")
    parser.add_argument("--emotions", type=str, nargs="+",
                        default=["neutral", "happy", "sad", "angry", "fearful", "disgusted", "surprised"],
                        help="Emotion folders to look for")
    parser.add_argument("--output", type=str, default="data/prepared/",
                        help="Output directory for prepared datasets")

    args = parser.parse_args()

    logger.info("\n" + "=" * 60)
    logger.info("Portuguese BR Emotion Dataset Downloader")
    logger.info("=" * 60)

    # --prepare-local is exclusive: format an existing download, then exit.
    if args.prepare_local:
        data_dir = Path(args.prepare_local)
        dataset = prepare_local_dataset(data_dir, args.emotions)

        if dataset is not None:
            output_path = Path(args.output) / f"{data_dir.name}_prepared"
            output_path.mkdir(parents=True, exist_ok=True)
            dataset.save_to_disk(str(output_path))
            # BUGFIX: the original log string contained a literal newline inside
            # a single-quoted f-string, which is a SyntaxError.
            logger.info(f"✅ Saved to: {output_path}")

        return

    if args.dataset in ["verbo", "all"]:
        download_verbo()

    if args.dataset in ["emouej", "all"]:
        download_emouej()

    if args.dataset in ["coraa", "all"]:
        download_coraa_ser()

    logger.info("\n" + "=" * 60)
    logger.info("📋 Next Steps:")
    logger.info("=" * 60)
    logger.info("1. Download datasets manually from sources above")
    logger.info("2. Extract to data/raw/ directory")
    logger.info("3. Run this script with --prepare-local to format for training:")
    logger.info("\n   python scripts/data/download_ptbr_datasets.py \\")
    logger.info("     --prepare-local data/raw/verbo/ \\")
    logger.info("     --emotions neutral happy sad angry fearful disgusted surprised")
    logger.info("\n4. Use prepared dataset for fine-tuning:")
    logger.info("\n   python scripts/training/finetune_emotion2vec.py")
|
|
|
|
|
|
|
|
# Standard script entry guard: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":

    main()
|
|
|