# MiroThinker-v1.0-30B-FP8 Docker Compose
#
# Serves the model through vLLM's OpenAI-compatible API on port 8000.
#
# Usage (TP=2, recommended):
#   docker compose up
#
# Usage (single GPU, not recommended):
#   SINGLE_GPU=1 docker compose up
services:
  mirothinker:
    image: vllm/vllm-openai:v0.11.2
    ports:
      - "8000:8000"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - HF_HOME=/root/.cache/huggingface
      # Set SINGLE_GPU=1 for single GPU mode (poor performance).
      # The host value is forwarded into the container, where the startup
      # script below reads it.
      - SINGLE_GPU=${SINGLE_GPU:-}
    volumes:
      # Cache downloaded models
      - hf_cache:/root/.cache/huggingface
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    shm_size: '16gb'
    # The vllm/vllm-openai image defines its own ENTRYPOINT (the vLLM API
    # server). Without this override the script below would be handed to
    # vLLM as command-line arguments instead of being run by a shell.
    entrypoint: ["/bin/sh", "-c"]
    # Literal block scalar (|) keeps the script's newlines; long invocations
    # use explicit backslash continuations.
    # `$$` is Compose's escape for a literal `$`, so SINGLE_GPU is expanded by
    # the container shell at runtime, not by Compose on the host.
    # `exec` makes the server replace the shell so it receives stop signals.
    command:
      - |
        if [ -n "$$SINGLE_GPU" ]; then
          echo "WARNING: Single GPU mode - expect ~1-2 tok/s, 2K context max";
          exec python3 -m vllm.entrypoints.openai.api_server \
            --model Doradus/MiroThinker-v1.0-30B-FP8 \
            --host 0.0.0.0 \
            --port 8000 \
            --tensor-parallel-size 1 \
            --max-model-len 2048 \
            --max-num-seqs 4 \
            --gpu-memory-utilization 0.95 \
            --enforce-eager \
            --trust-remote-code
        else
          exec python3 -m vllm.entrypoints.openai.api_server \
            --model Doradus/MiroThinker-v1.0-30B-FP8 \
            --host 0.0.0.0 \
            --port 8000 \
            --tensor-parallel-size 2 \
            --max-model-len 32768 \
            --gpu-memory-utilization 0.90 \
            --trust-remote-code \
            --enable-chunked-prefill
        fi
    # Polls the server's /health endpoint from inside the container.
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s
volumes:
  # Named volume backing the Hugging Face cache mount above.
  hf_cache: {}