# SkyPilot task for annotating the complete Orpheus dataset (118k samples)
# Uses multi-GPU for parallel processing

name: ensemble-annotate-orpheus

resources:
  use_spot: true
  accelerators: A100:4  # 4x A100 for parallel annotation
  # Or use cheaper options: L4:8, V100:4
  memory: 64+
  disk_size: 200  # Need space for dataset + annotations
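
# Hugging Face token for the upload step at the end of `run`. This envs block is
# only a sketch of one way to supply the token; pass the real value at launch
# time, e.g. `sky launch ... --env HF_TOKEN=hf_xxx`, and uncomment to enable.
# envs:
#   HF_TOKEN: ""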

setup: |
  set -e
  echo "πŸ”§ Setting up annotation environment..."

  # Install dependencies
  sudo apt-get update -qq
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn pyarrow

  # Clone repo
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi
  cd ensemble-tts-annotation

  echo "βœ… Setup complete!"
  nvidia-smi
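
  # Optionally authenticate with the Hugging Face Hub here so the push_to_hub
  # call in the run step works. Assumes HF_TOKEN is provided (see the
  # commented-out envs block above / `sky launch --env HF_TOKEN=...`):
  # huggingface-cli login --token "$HF_TOKEN"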

run: |
  cd ensemble-tts-annotation

  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "πŸš€ Annotating Orpheus dataset with $GPU_COUNT GPUs"
  echo "================================================"

  # Download Orpheus dataset
  echo "πŸ“₯ Downloading Orpheus TTS dataset..."
  python -c "
  from datasets import load_dataset
  import os

  print('Loading dataset...')
  dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')
  print(f'βœ“ Loaded {len(dataset)} samples')

  # Save locally for faster access
  os.makedirs('data/raw/orpheus/', exist_ok=True)
  dataset.save_to_disk('data/raw/orpheus/dataset')
  print('βœ“ Saved locally')
  "

  # Annotate with ensemble (parallel processing)
  echo "🎯 Running ensemble annotation..."
  mkdir -p data/annotated  # make sure the output directory exists
  python scripts/ensemble/annotate_ensemble.py \
    --input data/raw/orpheus/dataset \
    --mode balanced \
    --device cuda \
    --batch-size 32 \
    --num-workers 8 \
    --output data/annotated/orpheus_annotated.parquet
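
  # If annotate_ensemble.py does not parallelize across GPUs internally, one
  # process per GPU could be launched instead. The --shard/--num-shards flags
  # below are hypothetical and assume the script supports sharding its input:
  # for i in $(seq 0 $((GPU_COUNT - 1))); do
  #   CUDA_VISIBLE_DEVICES=$i python scripts/ensemble/annotate_ensemble.py \
  #     --input data/raw/orpheus/dataset --shard $i --num-shards $GPU_COUNT \
  #     --output data/annotated/orpheus_annotated_shard_$i.parquet &
  # done
  # wait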
echo "βœ… Annotation complete!"
echo "================================================"
# Statistics
echo "πŸ“Š Annotation statistics:"
python -c "
import pandas as pd
df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
print(f'Total samples: {len(df)}')
print(f'\nEmotion distribution:')
print(df['emotion'].value_counts())
print(f'\nConfidence statistics:')
print(df['emotion_confidence'].describe())
"

  # Upload to HuggingFace
  echo "πŸ“€ Uploading annotated dataset to HuggingFace..."
  python -c "
  from datasets import Dataset
  import pandas as pd

  df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
  dataset = Dataset.from_pandas(df)

  # Push to HuggingFace Hub
  dataset.push_to_hub(
      'marcosremar2/orpheus-tts-portuguese-annotated',
      private=False
  )
  print('βœ“ Uploaded to HuggingFace!')
  "
echo "================================================"
echo "βœ… Complete! Annotated dataset available at:"
echo " https://huggingface.co/datasets/marcosremar2/orpheus-tts-portuguese-annotated"

# File mounts (if dataset is pre-stored in cloud)
# file_mounts:
#   /data/orpheus:
#     source: gs://my-bucket/orpheus-dataset/
#     mode: MOUNT

num_nodes: 1
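
# Example launch (cluster name is illustrative; assumes a configured cloud and
# a Hugging Face token):
#   sky launch -c orpheus-annotate scripts/cloud/skypilot_annotate_orpheus.yaml \
#     --env HF_TOKEN=<your_hf_token>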