johnisafridge committed dc3c3e6 · verified · 1 Parent(s): e235d17

CPU only setup

Files changed (1):
  1. app.py +59 -34
app.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import gradio as gr
-import spaces
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import torch
@@ -9,47 +8,56 @@ from PIL import Image, ImageDraw
 from io import BytesIO
 import re
 
+# -------- Runtime / device --------
+# Force CPU usage
+device = "cpu"
 
-# ---- HF Spaces: ensure we read the platform port ----
+# Hugging Face Spaces port
+PORT = int(os.getenv("PORT", "7860"))
+
+# -------- Model / Processor --------
+# NOTE: device_map=None + .to(device) keeps everything on CPU
 models = {
     "OS-Copilot/OS-Atlas-Base-7B": Qwen2VLForConditionalGeneration.from_pretrained(
         "OS-Copilot/OS-Atlas-Base-7B",
-        torch_dtype="auto",
-        device_map="auto",
-    ),
+        dtype="auto",  # use 'dtype' (new) rather than deprecated 'torch_dtype'
+        device_map=None
+    ).to(device)
 }
 
 processors = {
     "OS-Copilot/OS-Atlas-Base-7B": AutoProcessor.from_pretrained("OS-Copilot/OS-Atlas-Base-7B")
 }
 
-
+# -------- Helpers --------
 def image_to_base64(image: Image.Image) -> str:
     buffered = BytesIO()
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode("utf-8")
 
-
 def draw_bounding_boxes(image: Image.Image, bounding_boxes, outline_color="red", line_width=2):
     draw = ImageDraw.Draw(image)
-    for box in bounding_boxes:
+    for box in bounding_boxes or []:
         xmin, ymin, xmax, ymax = box
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
+    if not bounding_boxes:
+        return []
     x_scale = original_width / scaled_width
     y_scale = original_height / scaled_height
-    rescaled_boxes = []
-    for box in bounding_boxes:
-        xmin, ymin, xmax, ymax = box
-        rescaled_boxes.append([xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale])
-    return rescaled_boxes
-
+    return [
+        [xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale]
+        for (xmin, ymin, xmax, ymax) in bounding_boxes
+    ]
 
-@spaces.GPU
+# -------- Inference --------
 def run_example(image, text_input, model_id="OS-Copilot/OS-Atlas-Base-7B"):
+    # Basic validation so the Space doesn't 500
+    if image is None or (text_input is None or str(text_input).strip() == ""):
+        return "", [], image
+
     model = models[model_id].eval()
     processor = processors[model_id]
 
@@ -64,6 +72,7 @@ def run_example(image, text_input, model_id="OS-Copilot/OS-Atlas-Base-7B"):
         }
     ]
 
+    # Build inputs
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
@@ -72,32 +81,43 @@ def run_example(image, text_input, model_id="OS-Copilot/OS-Atlas-Base-7B"):
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    ).to("cuda")
+    )
+
+    # Move tensors to CPU explicitly
+    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}
 
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-    output_text = processor.batch_decode(
+    # Generate
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=128)
+
+    # Post-process
+    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)]
+    output_texts = processor.batch_decode(
         generated_ids_trimmed, skip_special_tokens=False, clean_up_tokenization_spaces=False
     )
-    text = output_text[0]
+    text = output_texts[0] if output_texts else ""
 
-    # ---- simple, defensive parsing so the Space doesn't 500 if pattern not found ----
+    # Parse object_ref and bbox defensively
     object_ref_pattern = r"<\|object_ref_start\|>(.*?)<\|object_ref_end\|>"
    box_pattern = r"<\|box_start\|>(.*?)<\|box_end\|>"
 
    object_match = re.search(object_ref_pattern, text or "")
    box_match = re.search(box_pattern, text or "")
 
-    object_ref = object_match.group(1) if object_match else ""
-    box_content = box_match.group(1) if box_match else ""
+    object_ref = object_match.group(1).strip() if object_match else ""
+    box_content = box_match.group(1).strip() if box_match else ""
 
    boxes = []
    if box_content:
        try:
-            parsed = [tuple(map(int, pair.strip("()").split(","))) for pair in box_content.split("),(")]
-            # expecting two points -> convert to [xmin, ymin, xmax, ymax]
-            if len(parsed) >= 2:
-                boxes = [[parsed[0][0], parsed[0][1], parsed[1][0], parsed[1][1]]]
+            # Expecting "(x1,y1),(x2,y2)" -> convert to [xmin, ymin, xmax, ymax]
+            parts = [p.strip() for p in box_content.split("),(")]
+            parts[0] = parts[0].lstrip("(")
+            parts[-1] = parts[-1].rstrip(")")
+            coords = [tuple(map(int, p.split(","))) for p in parts]
+            if len(coords) >= 2:
+                (x1, y1), (x2, y2) = coords[0], coords[1]
+                boxes = [[x1, y1, x2, y2]]
        except Exception:
            boxes = []
 
@@ -106,7 +126,7 @@ def run_example(image, text_input, model_id="OS-Copilot/OS-Atlas-Base-7B"):
 
    return object_ref, scaled_boxes, annotated
 
-
+# -------- UI --------
 css = """
 #output {
     height: 500px;
@@ -141,8 +161,8 @@ with gr.Blocks(css=css) as demo:
         inputs=[input_img, text_input],
         outputs=[model_output_text, model_output_box, annotated_image],
         fn=run_example,
-        cache_examples=False,
-        label="Try examples",
+        cache_examples=False,  # IMPORTANT: don't run inference at startup
+        label="Try examples"
     )
 
    submit_btn.click(
@@ -151,6 +171,11 @@ with gr.Blocks(css=css) as demo:
         [model_output_text, model_output_box, annotated_image],
     )
 
-# ---- HF Spaces: bind to all interfaces + use provided port; disable API schema to avoid json-schema bug ----
-PORT = int(os.getenv("PORT", "7860"))
-demo.queue(api_open=False).launch(server_name="0.0.0.0", server_port=PORT, show_error=True, debug=True)
+# -------- Launch (Spaces-friendly) --------
+# api_open=False avoids known JSON-schema crashes on some versions
+demo.queue(api_open=False).launch(
+    server_name="0.0.0.0",
+    server_port=PORT,
+    show_error=True,
+    debug=True
+)
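
Note (not part of the commit): the post-processing in run_example assumes the model reports each box as "(x1,y1),(x2,y2)" in a 0-1000 reference frame, and rescale_bounding_boxes maps that frame onto the real screenshot. A self-contained sketch of the same arithmetic, with made-up values:

# Standalone sketch of the parsing + rescaling convention used in app.py.
# The box string and screenshot size below are made up for illustration.
box_content = "(250,500),(300,540)"           # contents of <|box_start|>...<|box_end|>
original_width, original_height = 1920, 1080  # real screenshot size

# Parse "(x1,y1),(x2,y2)" -> two integer points, as run_example does.
parts = [p.strip() for p in box_content.split("),(")]
parts[0] = parts[0].lstrip("(")
parts[-1] = parts[-1].rstrip(")")
(x1, y1), (x2, y2) = [tuple(map(int, p.split(","))) for p in parts][:2]

# Map from the 0-1000 frame to pixels, as rescale_bounding_boxes does.
x_scale = original_width / 1000   # 1.92
y_scale = original_height / 1000  # 1.08
print([x1 * x_scale, y1 * y_scale, x2 * x_scale, y2 * y_scale])
# -> [480.0, 540.0, 576.0, 583.2]

Running it prints [480.0, 540.0, 576.0, 583.2], i.e. the 0-1000 box scaled onto a 1920x1080 screenshot, which is what draw_bounding_boxes then draws.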