# MiroThinker-v1.0-30B-FP8 Docker Compose
#
# Usage (TP=2, recommended):
#   docker compose up
#
# Usage (single GPU, not recommended):
#   SINGLE_GPU=1 docker compose up
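#
# Follow startup progress (the first run downloads ~30 GB of weights):
#   docker compose logs -f mirothinker
#
# Example request once the container reports healthy (standard vLLM
# OpenAI-compatible endpoint):
#   curl -s http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Doradus/MiroThinker-v1.0-30B-FP8",
#          "messages": [{"role": "user", "content": "Hello"}]}'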
services:
  mirothinker:
    image: vllm/vllm-openai:v0.11.2
    ports:
      - "8000:8000"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - HF_HOME=/root/.cache/huggingface
      # Set SINGLE_GPU=1 for single GPU mode (poor performance)
      - SINGLE_GPU=${SINGLE_GPU:-}
    volumes:
      # Cache downloaded models
      - hf_cache:/root/.cache/huggingface
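    # GPU reservation below requires the NVIDIA Container Toolkit on the host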
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
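    # NCCL communicates between GPUs through /dev/shm; Docker's 64 MB
    # default is far too small for tensor-parallel inference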
    shm_size: '16gb'
    # $$ defers variable expansion to the container shell, which reads the
    # SINGLE_GPU value passed through the environment section above
    command: >
      sh -c '
      if [ -n "$$SINGLE_GPU" ]; then
      echo "WARNING: Single GPU mode - expect ~1-2 tok/s, 2K context max";
      python -m vllm.entrypoints.openai.api_server
      --model Doradus/MiroThinker-v1.0-30B-FP8
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 1
      --max-model-len 2048
      --max-num-seqs 4
      --gpu-memory-utilization 0.95
      --enforce-eager
      --trust-remote-code;
      else
      python -m vllm.entrypoints.openai.api_server
      --model Doradus/MiroThinker-v1.0-30B-FP8
      --host 0.0.0.0
      --port 8000
      --tensor-parallel-size 2
      --max-model-len 32768
      --gpu-memory-utilization 0.90
      --trust-remote-code
      --enable-chunked-prefill;
      fi
      '
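    # vLLM serves /health once the model has finished loading; start_period
    # allows time for the first-run download and weight load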
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s
volumes:
  hf_cache:
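# The hf_cache volume persists model weights between runs; remove it with
# "docker compose down -v" to force a re-download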