# SkyPilot task for annotating the complete Orpheus dataset (118k samples)
# Uses multi-GPU for parallel processing

name: ensemble-annotate-orpheus

resources:
  use_spot: true
  accelerators: A100:4  # 4x A100 for parallel annotation
  # Or use cheaper options: L4:8, V100:4
  memory: 64+
  disk_size: 200  # Need space for dataset + annotations
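
# Hugging Face token for the upload step at the end of `run`. This envs block is
# only a sketch of one way to supply the token; pass the real value at launch
# time, e.g. `sky launch ... --env HF_TOKEN=hf_xxx`, and uncomment to enable.
# envs:
#   HF_TOKEN: ""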

setup: |
  set -e
  echo "πŸ”§ Setting up annotation environment..."

  # Install dependencies
  sudo apt-get update -qq
  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  pip install --quiet transformers datasets librosa soundfile accelerate
  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn pyarrow

  # Clone repo
  if [ ! -d "ensemble-tts-annotation" ]; then
    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
  fi
  cd ensemble-tts-annotation

  echo "βœ… Setup complete!"
  nvidia-smi
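
  # Optionally authenticate with the Hugging Face Hub here so the push_to_hub
  # call in the run step works. Assumes HF_TOKEN is provided (see the
  # commented-out envs block above / `sky launch --env HF_TOKEN=...`):
  # huggingface-cli login --token "$HF_TOKEN"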

run: |
  cd ensemble-tts-annotation

  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "πŸš€ Annotating Orpheus dataset with $GPU_COUNT GPUs"
  echo "================================================"

  # Download Orpheus dataset
  echo "πŸ“₯ Downloading Orpheus TTS dataset..."
  python -c "
  from datasets import load_dataset
  import os

  print('Loading dataset...')
  dataset = load_dataset('marcosremar2/orpheus-tts-portuguese-dataset', split='train')
  print(f'βœ“ Loaded {len(dataset)} samples')

  # Save locally for faster access
  os.makedirs('data/raw/orpheus/', exist_ok=True)
  dataset.save_to_disk('data/raw/orpheus/dataset')
  print('βœ“ Saved locally')
  "

  # Annotate with ensemble (parallel processing)
  echo "🎯 Running ensemble annotation..."
  mkdir -p data/annotated  # make sure the output directory exists
  python scripts/ensemble/annotate_ensemble.py \
    --input data/raw/orpheus/dataset \
    --mode balanced \
    --device cuda \
    --batch-size 32 \
    --num-workers 8 \
    --output data/annotated/orpheus_annotated.parquet
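
  # If annotate_ensemble.py does not parallelize across GPUs internally, one
  # process per GPU could be launched instead. The --shard/--num-shards flags
  # below are hypothetical and assume the script supports sharding its input:
  # for i in $(seq 0 $((GPU_COUNT - 1))); do
  #   CUDA_VISIBLE_DEVICES=$i python scripts/ensemble/annotate_ensemble.py \
  #     --input data/raw/orpheus/dataset --shard $i --num-shards $GPU_COUNT \
  #     --output data/annotated/orpheus_annotated_shard_$i.parquet &
  # done
  # wait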
echo "βœ… Annotation complete!"
echo "================================================"
# Statistics
echo "πŸ“Š Annotation statistics:"
python -c "
import pandas as pd
df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
print(f'Total samples: {len(df)}')
print(f'\nEmotion distribution:')
print(df['emotion'].value_counts())
print(f'\nConfidence statistics:')
print(df['emotion_confidence'].describe())
"

  # Upload to HuggingFace
  echo "πŸ“€ Uploading annotated dataset to HuggingFace..."
  python -c "
  from datasets import Dataset
  import pandas as pd

  df = pd.read_parquet('data/annotated/orpheus_annotated.parquet')
  dataset = Dataset.from_pandas(df)

  # Push to HuggingFace Hub
  dataset.push_to_hub(
      'marcosremar2/orpheus-tts-portuguese-annotated',
      private=False
  )
  print('βœ“ Uploaded to HuggingFace!')
  "
echo "================================================"
echo "βœ… Complete! Annotated dataset available at:"
echo " https://huggingface.co/datasets/marcosremar2/orpheus-tts-portuguese-annotated"

# File mounts (if dataset is pre-stored in cloud)
# file_mounts:
#   /data/orpheus:
#     source: gs://my-bucket/orpheus-dataset/
#     mode: MOUNT

num_nodes: 1
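
# Example launch (cluster name is illustrative; assumes a configured cloud and
# a Hugging Face token):
#   sky launch -c orpheus-annotate scripts/cloud/skypilot_annotate_orpheus.yaml \
#     --env HF_TOKEN=<your_hf_token>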