# MiroThinker-v1.0-30B-FP8 with vLLM
# Recommended: TP=2 with 2x 24GB+ GPUs
#
# Build:
# docker build -f Dockerfile.vllm -t mirothinker-fp8 .
#
# Run (TP=2, recommended):
# docker run --gpus all --ipc=host -p 8000:8000 mirothinker-fp8
# (--ipc=host provides the shared memory PyTorch needs for tensor parallelism)
#
# Run (single GPU, not recommended):
# docker run --gpus '"device=0"' -p 8000:8000 -e SINGLE_GPU=1 mirothinker-fp8
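#
# Smoke test once the server is up (a sketch; assumes the OpenAI-compatible
# chat endpoint, with the served model name defaulting to MODEL_ID):
# curl http://localhost:8000/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{"model": "Doradus/MiroThinker-v1.0-30B-FP8", "messages": [{"role": "user", "content": "Hello"}]}'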
FROM vllm/vllm-openai:v0.11.2
# Download model from HuggingFace on first run
ENV MODEL_ID="Doradus/MiroThinker-v1.0-30B-FP8"
ENV HOST="0.0.0.0"
ENV PORT="8000"
# Default: TP=2 for 2x GPUs (recommended)
ENV TENSOR_PARALLEL_SIZE="2"
ENV MAX_MODEL_LEN="32768"
ENV GPU_MEMORY_UTILIZATION="0.90"
# For single GPU fallback (set SINGLE_GPU=1)
ENV SINGLE_GPU=""
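# Any of the knobs above can be overridden at run time without rebuilding,
# e.g. (illustrative value, not tuned for this model):
# docker run --gpus all --ipc=host -p 8000:8000 -e MAX_MODEL_LEN=16384 mirothinker-fp8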
EXPOSE 8000
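# Optional liveness probe (a sketch; assumes vLLM's /health endpoint and a
# generous start period to cover the first-run model download):
HEALTHCHECK --start-period=300s --interval=30s --timeout=10s \
    CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/health')"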
# The base image's ENTRYPOINT launches the API server directly; clear it so
# the shell logic in CMD below runs instead
ENTRYPOINT []
CMD if [ -n "$SINGLE_GPU" ]; then \
        echo "WARNING: Single GPU mode - limited context and throughput" && \
        python -m vllm.entrypoints.openai.api_server \
            --model "$MODEL_ID" \
            --host "$HOST" \
            --port "$PORT" \
            --tensor-parallel-size 1 \
            --max-model-len 2048 \
            --max-num-seqs 4 \
            --gpu-memory-utilization 0.95 \
            --enforce-eager \
            --trust-remote-code; \
    else \
        python -m vllm.entrypoints.openai.api_server \
            --model "$MODEL_ID" \
            --host "$HOST" \
            --port "$PORT" \
            --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
            --max-model-len "$MAX_MODEL_LEN" \
            --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
            --trust-remote-code \
            --enable-chunked-prefill; \
    fi