ChickenMcSwag committed on
Commit 0a902de · 1 Parent(s): b6a2854
Files changed (3)
  1. Dockerfile +1 -1
  2. requirements.txt +1 -1
  3. server.py +47 -14
Dockerfile CHANGED
@@ -6,8 +6,8 @@ RUN apt-get update && apt-get install -y wget git && rm -rf /var/lib/apt/lists/*
 ENV HOME=/app
 # Use a writable directory for HF caches on Spaces
 ENV HF_HOME=/data/huggingface
-ENV TRANSFORMERS_CACHE=/data/huggingface
 ENV HF_HUB_DISABLE_PROGRESS_BARS=1
+ENV HF_HUB_CACHE=/data/huggingface

 WORKDIR /app

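Side note (not part of the commit): a minimal sketch for confirming, from inside the running container, that the cache variables set in the Dockerfile are what the process actually sees. HF_HUB_CACHE is the variable newer huggingface_hub/transformers releases read, while TRANSFORMERS_CACHE is deprecated, which appears to be the motivation for this change. The script name is hypothetical.

# check_cache_env.py -- hypothetical helper, not part of this commit.
import os

# Print the cache-related variables from the Dockerfile so you can verify
# they point at the writable /data/huggingface path on the Space.
for var in ("HF_HOME", "HF_HUB_CACHE", "TRANSFORMERS_CACHE", "HF_HUB_DISABLE_PROGRESS_BARS"):
    print(f"{var}={os.environ.get(var, '<unset>')}")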
requirements.txt CHANGED
@@ -1,6 +1,6 @@
 fastapi
 uvicorn[standard]
 torch
-transformers
+transformers>=4.43.0
 accelerate
 sentencepiece
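Side note (not part of the commit): the >=4.43.0 floor lines up with the Qwen2-VL import guard added in server.py below. A quick way to check that the installed wheel is new enough, assuming the Qwen2-VL classes are what the pin is for:

import importlib.metadata

# Report the installed transformers version and whether the Qwen2-VL class
# used by server.py can be imported from it.
print("transformers", importlib.metadata.version("transformers"))
try:
    from transformers import Qwen2VLForConditionalGeneration  # noqa: F401
    print("Qwen2-VL support: available")
except ImportError:
    print("Qwen2-VL support: missing -- upgrade transformers to >=4.43")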
server.py CHANGED
@@ -1,22 +1,50 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, AutoProcessor
 import os
 import torch

 MODEL_ID = "osunlp/UGround-V1-72B"
-CACHE_DIR = os.environ.get("TRANSFORMERS_CACHE", "/data/huggingface")
-
-# Load tokenizer & model (multi-GPU aware via accelerate)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, cache_dir=CACHE_DIR)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",  # automatically shards across GPUs
-    trust_remote_code=True,
-    cache_dir=CACHE_DIR
+CACHE_DIR = (
+    os.environ.get("HF_HUB_CACHE")
+    or os.environ.get("HF_HOME")
+    or "/data/huggingface"
 )

+# Inspect config and load appropriate stack
+config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True, cache_dir=CACHE_DIR)
+
+is_qwen2_vl = getattr(config, "model_type", None) == "qwen2_vl" or (
+    config.__class__.__name__.lower().startswith("qwen2vl")
+)
+
+if is_qwen2_vl:
+    try:
+        from transformers import Qwen2VLForConditionalGeneration  # type: ignore
+    except Exception as e:
+        raise RuntimeError(
+            "Transformers version does not support Qwen2-VL. Please upgrade transformers to >=4.43."
+        ) from e
+    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, cache_dir=CACHE_DIR)
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        trust_remote_code=True,
+        cache_dir=CACHE_DIR,
+    )
+    _use_processor = True
+else:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, cache_dir=CACHE_DIR)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",  # automatically shards across GPUs
+        trust_remote_code=True,
+        cache_dir=CACHE_DIR
+    )
+    _use_processor = False
+
 app = FastAPI()

 # OpenAI-style request schema
@@ -33,9 +61,14 @@ class ChatCompletionRequest(BaseModel):
 async def chat_completions(req: ChatCompletionRequest):
     # Concatenate messages into one prompt
     prompt = "\n".join([m.content for m in req.messages])
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    outputs = model.generate(**inputs, max_new_tokens=req.max_tokens)
-    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    if _use_processor:
+        inputs = processor(text=prompt, return_tensors="pt").to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=req.max_tokens)
+        text = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
+    else:
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=req.max_tokens)
+        text = tokenizer.decode(outputs[0], skip_special_tokens=True)

     return {
         "id": "chatcmpl-uground72b",