MiroThinker-v1.0-30B-FP8 / docker /docker-compose.yml
Doradus's picture
Upload docker/docker-compose.yml with huggingface_hub
be2f435 verified
raw
history blame
1.79 kB
# MiroThinker-v1.0-30B-FP8 Docker Compose
#
# Usage (TP=2, recommended):
#   docker compose up
#
# Usage (single GPU, not recommended):
#   SINGLE_GPU=1 docker compose up
services:
  mirothinker:
    image: vllm/vllm-openai:v0.11.2
    ports:
      - "8000:8000"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - HF_HOME=/root/.cache/huggingface
      # Set SINGLE_GPU=1 for single GPU mode (poor performance)
      - SINGLE_GPU=${SINGLE_GPU:-}
    volumes:
      # Cache downloaded models
      - hf_cache:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Large shared memory segment for NCCL / tensor-parallel IPC.
    shm_size: '16gb'
    # The vllm/vllm-openai image ships an ENTRYPOINT of
    # "python3 -m vllm.entrypoints.openai.api_server". Without overriding it,
    # the "sh -c ..." command would be passed to the server as arguments
    # instead of being executed, and the SINGLE_GPU branch would never run.
    entrypoint: ["/bin/sh", "-c"]
    # NOTE: "$$" escapes compose's config-time interpolation so the variable
    # is read by the container shell from the environment set above (a plain
    # "$SINGLE_GPU" would be substituted — with a warning — at parse time).
    # "exec" makes vLLM PID 1 so it receives SIGTERM on "docker compose down".
    command:
      - |
        if [ -n "$$SINGLE_GPU" ]; then
          echo "WARNING: Single GPU mode - expect ~1-2 tok/s, 2K context max"
          exec python3 -m vllm.entrypoints.openai.api_server \
            --model Doradus/MiroThinker-v1.0-30B-FP8 \
            --host 0.0.0.0 \
            --port 8000 \
            --tensor-parallel-size 1 \
            --max-model-len 2048 \
            --max-num-seqs 4 \
            --gpu-memory-utilization 0.95 \
            --enforce-eager \
            --trust-remote-code
        else
          exec python3 -m vllm.entrypoints.openai.api_server \
            --model Doradus/MiroThinker-v1.0-30B-FP8 \
            --host 0.0.0.0 \
            --port 8000 \
            --tensor-parallel-size 2 \
            --max-model-len 32768 \
            --gpu-memory-utilization 0.90 \
            --trust-remote-code \
            --enable-chunked-prefill
        fi
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Grace period for first model download + weight loading before
      # failed probes count against "retries".
      start_period: 120s

volumes:
  hf_cache: