ensemble-tts-annotation / scripts /cloud /skypilot_multi_gpu.yaml

marcosremar

🚀 SkyPilot Multi-Cloud GPU Support + Synthetic Data Generation

13e402e about 1 month ago

2.41 kB

	# SkyPilot Multi-GPU Configuration for Fast Fine-tuning
	# Requests 8 GPUs on a single node; the run step generates a synthetic
	# dataset, prepares it, launches the emotion2vec fine-tuning script across
	# all detected GPUs via `accelerate launch`, and finishes with a quick
	# benchmark.

	# Task name.
	name: ensemble-multi-gpu

	# Hardware requested for the task. Every key below must be nested under
	# `resources:`; as top-level keys they would not be part of the resource
	# request.
	resources:
	  # Request spot capacity instead of on-demand instances.
	  use_spot: true
	  accelerators: A100:8  # 8x A100 GPUs
	  # Alternative cheaper options:
	  # accelerators: V100:8  # 8x V100
	  # accelerators: L4:8    # 8x L4 (cheaper)

	  memory: 128+    # 128GB+ RAM for multi-GPU ("+" means "at least")
	  disk_size: 500  # 500GB for datasets

	# Provisioning script: installs the Python dependencies, clones the project
	# repository when it is not already present, and lists the visible GPUs.
	setup: |
	  # Abort on the first failing command so a broken install is never used.
	  set -e

	  echo "🔧 Setting up multi-GPU environment..."

	  # Install dependencies (PyTorch wheels come from the cu118 wheel index)
	  sudo apt-get update -qq
	  pip install --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
	  pip install --quiet transformers datasets librosa soundfile accelerate
	  pip install --quiet huggingface_hub pandas numpy tqdm scikit-learn

	  # Clone repo (skipped when the checkout already exists, so setup can re-run)
	  if [ ! -d "ensemble-tts-annotation" ]; then
	    git clone https://huggingface.co/marcosremar2/ensemble-tts-annotation
	  fi

	  cd ensemble-tts-annotation

	  echo "✅ Setup complete!"
	  echo "GPUs available:"
	  nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader

	# Job script: builds a synthetic dataset, prepares it, fine-tunes on every
	# GPU reported by nvidia-smi, then runs a quick benchmark.
	run: |
	  # Fail fast: stop at the first failing step (including a failure inside a
	  # pipeline) rather than training or benchmarking on missing artifacts.
	  set -euo pipefail

	  cd ensemble-tts-annotation

	  # Check GPU count: nvidia-smi prints one line per GPU, wc -l counts them
	  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
	  echo "🚀 Multi-GPU Training with $GPU_COUNT GPUs"
	  echo "================================================"

	  # Create synthetic data
	  echo "📊 Creating synthetic dataset (larger for multi-GPU)..."
	  python scripts/data/create_synthetic_test_data.py \
	    --output data/raw/synthetic_large/ \
	    --samples 200

	  # Prepare dataset
	  echo "📦 Preparing dataset..."
	  python scripts/data/download_ptbr_datasets.py \
	    --prepare-local data/raw/synthetic_large/

	  # Fine-tune with multi-GPU (using accelerate): one process per detected GPU
	  echo "🔥 Fine-tuning with $GPU_COUNT GPUs..."
	  accelerate launch --multi_gpu --num_processes="$GPU_COUNT" \
	    scripts/training/finetune_emotion2vec.py \
	    --dataset data/prepared/synthetic_large_prepared \
	    --epochs 20 \
	    --batch-size 64 \
	    --device cuda \
	    --augment \
	    --output models/emotion/emotion2vec_finetuned_multigpu/

	  echo "✅ Fine-tuning complete!"

	  # Benchmark
	  echo "📊 Performance benchmark:"
	  python scripts/test/test_quick.py --mode balanced

	  echo "================================================"
	  echo "💡 Upload results with:"
	  echo "sky storage upload models/emotion/emotion2vec_finetuned_multigpu/ s3://my-bucket/"

	# Single-node job: the run script counts the GPUs on this one machine and
	# starts one training process per GPU.
	num_nodes: 1