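# Task definition: annotate the Orpheus TTS Portuguese dataset with the
# ensemble annotation scripts and publish the result to the Hugging Face Hub.
# The layout (name / resources / setup / run / num_nodes) follows the
# SkyPilot task YAML format.
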
# Name of this annotation task.
name: ensemble-annotate-orpheus

# Hardware requested for the annotation job.
resources:
  # Run on spot (preemptible) capacity.
  # NOTE(review): the run script shows no resume/checkpoint logic, so a
  # preemption appears to restart the work from scratch — confirm this is OK.
  use_spot: true

  # Four A100 GPUs on the node.
  accelerators: A100:4

  # Lower bound on host memory; the trailing "+" means "at least".
  # (SkyPilot documents this value in GiB — confirm against the version in use.)
  memory: 64+

  # Disk size for the node (SkyPilot documents this value in GB).
  disk_size: 200

# Provisioning script: installs Python dependencies and fetches the
# annotation repository.
setup: |
  # Abort setup on the first failing command.
  set -e

  echo "🔧 Setting up annotation environment..."

  # Refresh the apt package lists.
  sudo apt-get update -qq

  # PyTorch wheels from the CUDA 11.8 index, then the ML/audio and data tooling.
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn pyarrow

  # Clone the annotation repository only when the directory is not already
  # present, so re-running setup does not fail on an existing checkout.
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi

  cd ensemble-tts-annotation

  echo "✅ Setup complete!"

  # Print the visible GPUs as a sanity check.
  nvidia-smi

# Job script: download the dataset, run the ensemble annotation, print
# summary statistics, and push the annotated dataset to the Hugging Face Hub.
run: |
  # Stop at the first failing step so a failed download or annotation never
  # reaches the upload, and the job does not end on a successful `echo`.
  set -e

  cd ensemble-tts-annotation

  # One line per GPU is printed by nvidia-smi; count them for the banner.
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "🚀 Annotating Orpheus dataset with $GPU_COUNT GPUs"
  echo "================================================"

  # --- Step 1: download the source dataset and cache it on local disk ------
  echo "📥 Downloading Orpheus TTS dataset..."
  python -c "
  from datasets import load_dataset
  import os

  print('Loading dataset...')
  dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')
  print(f'✓ Loaded {len(dataset)} samples')

  # Save locally for faster access
  os.makedirs('data/raw/orpheus/', exist_ok=True)
  dataset.save_to_disk('data/raw/orpheus/dataset')
  print('✓ Saved locally')
  "

  # --- Step 2: run the ensemble annotation ---------------------------------
  echo "🎯 Running ensemble annotation..."

  # Make sure the output directory exists before the script writes the parquet.
  mkdir -p data/annotated

  python scripts/ensemble/annotate_ensemble.py \
    --input data/raw/orpheus/dataset \
    --mode balanced \
    --device cuda \
    --batch-size 32 \
    --num-workers 8 \
    --output data/annotated/orpheus_annotated.parquet

  echo "✅ Annotation complete!"
  echo "================================================"

  # --- Step 3: summarize the annotations -----------------------------------
  echo "📊 Annotation statistics:"
  python -c "
  import pandas as pd

  df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
  print(f'Total samples: {len(df)}')
  print(f'\nEmotion distribution:')
  print(df['emotion'].value_counts())
  print(f'\nConfidence statistics:')
  print(df['emotion_confidence'].describe())
  "

  # --- Step 4: publish the annotated dataset -------------------------------
  # NOTE(review): push_to_hub needs Hugging Face write credentials on the
  # node; nothing visible in this file configures a token — confirm how it
  # is provided (e.g. an HF_TOKEN environment variable).
  echo "📤 Uploading annotated dataset to HuggingFace..."
  python -c "
  from datasets import Dataset
  import pandas as pd

  df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
  dataset = Dataset.from_pandas(df)

  # Push to HuggingFace Hub
  dataset.push_to_hub(
      'marcosremar2/orpheus-tts-portuguese-annotated',
      private=False
  )
  print('✓ Uploaded to HuggingFace!')
  "

  echo "================================================"
  echo "✅ Complete! Annotated dataset available at:"
  echo " https://huggingface.co/datasets/marcosremar2/orpheus-tts-portuguese-annotated"

# Run the task on a single node.
num_nodes: 1