Doradus committed on
Commit
81b6ca6
·
verified ·
1 Parent(s): 8f566ff

Upload docker/Dockerfile.vllm with huggingface_hub

Browse files
Files changed (1) hide show
  1. docker/Dockerfile.vllm +52 -0
docker/Dockerfile.vllm ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# syntax=docker/dockerfile:1
# MiroThinker-v1.0-30B-FP8 with vLLM
# Recommended: TP=2 with 2x 24GB+ GPUs
#
# Build:
# docker build -f Dockerfile.vllm -t mirothinker-fp8 .
#
# Run (TP=2, recommended):
# docker run --gpus all -p 8000:8000 mirothinker-fp8
#
# Run (single GPU, not recommended):
# docker run --gpus '"device=0"' -p 8000:8000 -e SINGLE_GPU=1 mirothinker-fp8

FROM vllm/vllm-openai:v0.11.2

# Model weights are pulled from HuggingFace on first run (not baked into the image).
# Grouped ENV (key=value form) keeps related runtime config in one layer.
ENV MODEL_ID="Doradus/MiroThinker-v1.0-30B-FP8" \
    HOST="0.0.0.0" \
    PORT="8000"

# Default: TP=2 for 2x GPUs (recommended)
ENV TENSOR_PARALLEL_SIZE="2" \
    MAX_MODEL_LEN="32768" \
    GPU_MEMORY_UTILIZATION="0.90"

# For single GPU fallback (set SINGLE_GPU=1)
ENV SINGLE_GPU=""

# Documentation only — the actual listen port comes from $PORT at runtime.
EXPOSE 8000

# Cheap liveness probe against vLLM's /health endpoint. Long start period
# because the model is downloaded and loaded on first boot.
HEALTHCHECK --interval=30s --timeout=5s --start-period=600s --retries=3 \
  CMD curl -fsS "http://localhost:${PORT}/health" || exit 1

# The vllm/vllm-openai base image sets its own ENTRYPOINT (the API server);
# without clearing it, the shell-form CMD below would be passed to that
# entrypoint as arguments instead of running as the container command.
ENTRYPOINT []

# Shell form is intentional here: we need $VAR expansion and an if/else.
# `exec` replaces the shell so the API server is PID 1 and receives the
# SIGTERM from `docker stop` (clean shutdown instead of a 10s SIGKILL).
CMD if [ -n "$SINGLE_GPU" ]; then \
      echo "WARNING: Single GPU mode - limited context and throughput" && \
      exec python -m vllm.entrypoints.openai.api_server \
        --model "$MODEL_ID" \
        --host "$HOST" \
        --port "$PORT" \
        --tensor-parallel-size 1 \
        --max-model-len 2048 \
        --max-num-seqs 4 \
        --gpu-memory-utilization 0.95 \
        --enforce-eager \
        --trust-remote-code; \
    else \
      exec python -m vllm.entrypoints.openai.api_server \
        --model "$MODEL_ID" \
        --host "$HOST" \
        --port "$PORT" \
        --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
        --max-model-len "$MAX_MODEL_LEN" \
        --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
        --trust-remote-code \
        --enable-chunked-prefill; \
    fi