merterbak committed on
Commit
04db102
·
verified ·
1 Parent(s): b641ecb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +277 -0
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer
3
+ import torch
4
+ import spaces
5
+ import os
6
+ import sys
7
+ import tempfile
8
+ import shutil
9
+ from PIL import Image, ImageDraw, ImageFont, ImageOps
10
+ import fitz
11
+ import re
12
+ import warnings
13
+ import numpy as np
14
+ import base64
15
+ from io import StringIO, BytesIO
16
+
17
# Hub id of the checkpoint; shared by the tokenizer and the model below.
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'

# Both objects are built once at import time and reused by process_image.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally - confirm the repository/revision is trusted.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
).eval()
22
+
23
# Resolution presets offered in the "Mode" dropdown. Each value supplies the
# base_size / image_size / crop_mode keyword arguments handed to model.infer.
MODEL_CONFIGS = {
    "⚑ Gundam": {
        "base_size": 1024,
        "image_size": 640,
        "crop_mode": True,
    },
    "πŸš€ Tiny": {
        "base_size": 512,
        "image_size": 512,
        "crop_mode": False,
    },
    "πŸ“„ Small": {
        "base_size": 640,
        "image_size": 640,
        "crop_mode": False,
    },
    "πŸ“Š Base": {
        "base_size": 1024,
        "image_size": 1024,
        "crop_mode": False,
    },
    "🎯 Large": {
        "base_size": 1280,
        "image_size": 1280,
        "crop_mode": False,
    },
}
30
+
31
# Built-in tasks offered in the "Task" dropdown. "prompt" is the text sent to
# the model; "has_grounding" tells process_image whether to look for
# <|ref|>/<|det|> box annotations in the output. The "Locate" and "Custom"
# prompts stored here are placeholders: process_image builds those two
# prompts from the user's text instead of reading them from this table.
TASK_PROMPTS = {
    "πŸ“‹ Markdown": {
        "prompt": "<image>\n<|grounding|>Convert the document to markdown.",
        "has_grounding": True,
    },
    "πŸ“ Free OCR": {
        "prompt": "<image>\nFree OCR.",
        "has_grounding": False,
    },
    "πŸ“ Locate": {
        "prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.",
        "has_grounding": True,
    },
    "πŸ” Describe": {
        "prompt": "<image>\nDescribe this image in detail.",
        "has_grounding": False,
    },
    "✏️ Custom": {
        "prompt": "",
        "has_grounding": False,
    },
}
38
+
39
def extract_grounding_references(text):
    """Collect every ``<|ref|>label<|/ref|><|det|>coords<|/det|>`` span in *text*.

    Returns a list of ``(whole_span, label, coords_text)`` tuples in order of
    appearance. Matching is non-greedy and uses DOTALL, so both the label and
    the coordinate text may span several lines.
    """
    grounding_re = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    return grounding_re.findall(text)
42
+
43
def draw_bounding_boxes(image, refs, extract_images=False):
    """Draw the grounding boxes described by *refs* onto a copy of *image*.

    Args:
        image: PIL image the boxes refer to; it is never modified.
        refs: iterable of ``(whole_span, label, coords_text)`` tuples, the
            shape returned by ``extract_grounding_references``.
            ``coords_text`` must be the literal text of a list of
            ``[x1, y1, x2, y2]`` boxes.
        extract_images: when true, every box whose label is ``'image'`` is
            also cropped out of the untouched input image.

    Returns:
        ``(annotated_image, crops)``: the annotated copy and the list of
        cropped regions (empty unless *extract_images* is true).

    Refs whose coordinate text cannot be parsed, and individual boxes that
    are not four numbers, are skipped instead of aborting the whole drawing.
    """
    import ast  # function-scope import: only this block needs it

    img_w, img_h = image.size
    annotated = image.copy()
    outline_draw = ImageDraw.Draw(annotated)
    # Translucent fills are painted on a separate RGBA layer that is pasted
    # over the annotated copy (using itself as mask) once all boxes are done.
    overlay = Image.new('RGBA', annotated.size, (0, 0, 0, 0))
    fill_draw = ImageDraw.Draw(overlay)
    font = ImageFont.load_default()
    crops = []

    for ref in refs:
        label = ref[1]

        # SECURITY: ref[2] is model output derived from user-supplied
        # documents. The previous implementation passed it to eval(), which
        # would execute arbitrary expressions; literal_eval only accepts
        # Python literals.
        try:
            coords = ast.literal_eval(ref[2].strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if not isinstance(coords, (list, tuple)):
            continue

        color = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))
        color_a = color + (60,)
        width = 5 if label == 'title' else 3

        # The caption size depends only on the label, so measure it once per ref.
        text_bbox = outline_draw.textbbox((0, 0), label, font=font)
        tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]

        for box in coords:
            # Coordinates are scaled by /999 to pixel space.
            # NOTE(review): presumably the model emits a 0-999 normalised
            # grid - confirm against the model documentation.
            try:
                bx1, by1, bx2, by2 = box
                x1, x2 = sorted((int(bx1 / 999 * img_w), int(bx2 / 999 * img_w)))
                y1, y2 = sorted((int(by1 / 999 * img_h), int(by2 / 999 * img_h)))
            except (TypeError, ValueError):
                # Not a sequence of four numbers: ignore this box only.
                continue

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            outline_draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            fill_draw.rectangle([x1, y1, x2, y2], fill=color_a)

            # Caption sits 20px above the box, clamped to the top edge.
            ty = max(0, y1 - 20)
            outline_draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            outline_draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    annotated.paste(overlay, (0, 0), overlay)
    return annotated, crops
76
+
77
def clean_output(text, include_images=False, remove_labels=False):
    """Replace grounding annotations in raw model output.

    Every ``<|ref|>label<|/ref|><|det|>coords<|/det|>`` span is substituted,
    one occurrence at a time and in order of appearance:

    * spans labelled ``image`` become a ``**[Figure N]**`` placeholder when
      *include_images* is true, otherwise they are removed;
    * any other span is removed when *remove_labels* is true, otherwise it
      is replaced by its bare label text.

    Returns the stripped result, or ``""`` for empty/``None`` input.
    """
    if not text:
        return ""

    spans = re.findall(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        text,
        re.DOTALL,
    )

    img_num = 0
    for whole, label, _coords in spans:
        is_figure = '<|ref|>image<|/ref|>' in whole
        if is_figure and include_images:
            substitute = f'\n\n**[Figure {img_num + 1}]**\n\n'
            img_num += 1
        elif is_figure or remove_labels:
            substitute = ''
        else:
            substitute = label
        text = text.replace(whole, substitute, 1)

    return text.strip()
98
+
99
def embed_images(markdown, crops):
    """Inline cropped figures into *markdown* as base64 PNG data URIs.

    The i-th crop (1-based) replaces the first ``**[Figure i]**`` placeholder
    with a markdown image tag. *markdown* is returned unchanged when *crops*
    is empty or ``None``.
    """
    if not crops:
        return markdown

    for i, crop in enumerate(crops):
        png_buffer = BytesIO()
        crop.save(png_buffer, format="PNG")
        b64 = base64.b64encode(png_buffer.getvalue()).decode()
        markdown = markdown.replace(
            f'**[Figure {i + 1}]**',
            f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n',
            1,
        )

    return markdown
108
+
109
@spaces.GPU(duration=60)
def process_image(image, mode, task, custom_prompt):
    """Run the OCR model on one PIL image and post-process its output.

    Args:
        image: PIL image to analyse, or ``None`` when nothing was uploaded.
        mode: key of ``MODEL_CONFIGS`` selecting the resolution preset.
        task: key of ``TASK_PROMPTS``; the "Custom" and "Locate" tasks build
            their prompt from *custom_prompt* instead of the table.
        custom_prompt: user text for the "Custom"/"Locate" tasks (may be
            empty or ``None`` for the other tasks).

    Returns:
        ``(text, markdown, raw, boxed_image, crops)``. When validation fails
        or the model prints nothing, ``text`` carries a short message and the
        remaining elements are ``""``, ``""``, ``None`` and ``[]``.

    Raises:
        KeyError: if *mode* or *task* is not a known key.
    """
    if image is None:
        return " Error Upload image", "", "", None, []

    # Tolerate a missing prompt value and strip it once for all later uses.
    custom_prompt = (custom_prompt or "").strip()
    if task in ["✏️ Custom", "πŸ“ Locate"] and not custom_prompt:
        return "Enter prompt", "", "", None, []

    # Apply the EXIF orientation while the original metadata is still
    # attached, then normalise every non-RGB mode: the image is saved as JPEG
    # below and draw_bounding_boxes paints RGB colour tuples onto it.
    image = ImageOps.exif_transpose(image)
    if image.mode != 'RGB':
        image = image.convert('RGB')

    config = MODEL_CONFIGS[mode]

    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "πŸ“ Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # One scratch directory holds both the input JPEG and the model's output
    # folder, so a single rmtree in `finally` cleans up on success and error.
    work_dir = tempfile.mkdtemp()
    captured = StringIO()
    try:
        image_path = os.path.join(work_dir, 'input.jpg')
        out_dir = os.path.join(work_dir, 'output')
        os.makedirs(out_dir)
        image.save(image_path, 'JPEG', quality=95)

        # The recognised text is taken from what model.infer prints, so
        # stdout is swapped for a buffer during the call and always restored,
        # even if inference raises.
        # NOTE(review): sys.stdout is process-wide; output printed by other
        # threads during inference would be captured too.
        saved_stdout = sys.stdout
        sys.stdout = captured
        try:
            model.infer(tokenizer=tokenizer, prompt=prompt, image_file=image_path, output_path=out_dir,
                        base_size=config["base_size"], image_size=config["image_size"],
                        crop_mode=config["crop_mode"])
        finally:
            sys.stdout = saved_stdout
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    # Drop the progress/debug lines printed alongside the actual result.
    noise_markers = ('image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size')
    result = '\n'.join(
        line for line in captured.getvalue().split('\n')
        if not any(marker in line for marker in noise_markers)
    ).strip()

    if not result:
        return "No text", "", "", None, []

    cleaned = clean_output(result, False, False)
    markdown = clean_output(result, True, True)

    img_out = None
    crops = []

    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
167
+
168
@spaces.GPU(duration=300)
def process_pdf(path, mode, task, custom_prompt):
    """Run ``process_image`` on every page of a PDF and merge the results.

    Args:
        path: filesystem path of the PDF.
        mode, task, custom_prompt: forwarded unchanged to ``process_image``.

    Returns:
        ``(text, markdown, raw, None, crops)``. Per-page text and markdown
        are prefixed with a ``### Page N`` heading and joined with ``---``
        rules; pages whose text is empty or ``"No text"`` are left out. The
        boxed-image slot is always ``None`` for PDFs.
    """
    texts, markdowns, raws, all_crops = [], [], [], []
    # Scale factor 300/72 renders pages at 300 DPI (PDF units are 1/72 inch).
    render_matrix = fitz.Matrix(300/72, 300/72)

    doc = fitz.open(path)
    try:
        for page_no, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            img = Image.open(BytesIO(pix.tobytes("png")))

            text, md, raw, _, crops = process_image(img, mode, task, custom_prompt)

            if text and text != "No text":
                texts.append(f"### Page {page_no}\n\n{text}")
                markdowns.append(f"### Page {page_no}\n\n{md}")
                raws.append(f"=== Page {page_no} ===\n{raw}")
                all_crops.extend(crops)
    finally:
        # Release the document even when rendering or inference fails.
        doc.close()

    return ("\n\n---\n\n".join(texts) if texts else "No text in PDF",
            "\n\n---\n\n".join(markdowns) if markdowns else "No text in PDF",
            "\n\n".join(raws), None, all_crops)
191
+
192
def process_file(path, mode, task, custom_prompt):
    """Dispatch an uploaded file to the PDF or the image pipeline.

    Args:
        path: filesystem path of the upload; empty/``None`` when no file was
            provided.
        mode, task, custom_prompt: forwarded unchanged to the pipeline.

    Returns:
        The 5-tuple ``(text, markdown, raw, boxed_image, crops)`` produced by
        ``process_pdf`` or ``process_image``, or an error tuple when *path*
        is missing.
    """
    if not path:
        return "Error Upload file", "", "", None, []

    if path.lower().endswith('.pdf'):
        return process_pdf(path, mode, task, custom_prompt)

    # Image.open keeps the file handle open; the context manager releases it
    # once processing is done. process_image only returns derived copies and
    # crops, so nothing it hands back depends on the closed file.
    with Image.open(path) as image:
        return process_image(image, mode, task, custom_prompt)
200
+
201
def toggle_prompt(task):
    """Return the Gradio update for the prompt textbox matching *task*.

    The textbox is shown, with a task-specific label and placeholder, for the
    "Custom" and "Locate" tasks and hidden for every other task.
    """
    prompt_fields = {
        "✏️ Custom": {"label": "Custom Prompt", "placeholder": "Add <|grounding|> for boxes"},
        "πŸ“ Locate": {"label": "Text to Locate", "placeholder": "Enter text"},
    }
    field = prompt_fields.get(task)
    if field is None:
        return gr.update(visible=False)
    return gr.update(visible=True, **field)
207
+
208
def load_image(file_path):
    """Build the preview image for an uploaded file.

    Args:
        file_path: path of the upload, or empty/``None`` when cleared.

    Returns:
        ``None`` when there is no file or the PDF has no pages; the first
        page rendered with a 300/72 scale matrix for a PDF; otherwise the
        image opened from *file_path*.
    """
    if not file_path:
        return None

    if not file_path.lower().endswith('.pdf'):
        return Image.open(file_path)

    doc = fitz.open(file_path)
    try:
        if len(doc) == 0:
            return None
        page = doc.load_page(0)
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
        return Image.open(BytesIO(pix.tobytes("png")))
    finally:
        # Close the document even if rendering the first page fails.
        doc.close()
220
+
221
# Gradio UI: inputs in the left column, result tabs in the right column,
# followed by examples, a collapsible help section and the event wiring.
# NOTE(review): nesting was reconstructed from the statement order; confirm
# the placement of gr.Examples and the accordion against the deployed layout.
with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
    gr.Markdown("# πŸš€ DeepSeek-OCR\n**Convert documents to markdown, extract raw text, and locate specific content with bounding boxes**")

    with gr.Row():
        # Left column: upload, preview and run options.
        with gr.Column(scale=1):
            file_in = gr.File(
                label="Upload Image or PDF",
                file_types=["image", ".pdf"],
                type="filepath",
            )
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            mode = gr.Dropdown(
                list(MODEL_CONFIGS.keys()),
                value="⚑ Gundam",
                label="Mode",
            )
            task = gr.Dropdown(
                list(TASK_PROMPTS.keys()),
                value="πŸ“‹ Markdown",
                label="Task",
            )
            # Hidden until toggle_prompt reveals it for the Custom/Locate tasks.
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        # Right column: one tab per element of the processing result.
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("πŸ“ Text"):
                    text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
                with gr.Tab("🎨 Markdown"):
                    md_out = gr.Markdown("")
                with gr.Tab("πŸ–ΌοΈ Boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("πŸ–ΌοΈ Figures"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("πŸ” Raw"):
                    raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)

    gr.Examples(
        examples=[
            ["ocr.jpg", "⚑ Gundam", "πŸ“‹ Markdown", ""],
            ["reachy-mini.jpg", "⚑ Gundam", "πŸ“ Locate", "Robot"],
        ],
        inputs=[file_in, mode, task, prompt],
        cache_examples=False,
    )

    with gr.Accordion("ℹ️ Info", open=False):
        # The help text is kept flush-left so that no indentation ends up
        # inside the markdown source.
        gr.Markdown("""
### Modes
- **Gundam**: 1024 base + 640 tiles with cropping - Best balance
- **Tiny**: 512Γ—512, no crop - Fastest
- **Small**: 640Γ—640, no crop - Quick
- **Base**: 1024Γ—1024, no crop - Standard
- **Large**: 1280Γ—1280, no crop - Highest quality

### Tasks
- **Markdown**: Convert document to structured markdown (grounding βœ…)
- **Free OCR**: Simple text extraction
- **Locate**: Find specific text in image (grounding βœ…)
- **Describe**: General image description
- **Custom**: Your own prompt (add `<|grounding|>` for boxes)
""")

    # Event wiring: preview on upload, prompt box visibility on task change,
    # and the main extraction on button click.
    file_in.change(fn=load_image, inputs=[file_in], outputs=[input_img])
    task.change(fn=toggle_prompt, inputs=[task], outputs=[prompt])
    btn.click(
        fn=process_file,
        inputs=[file_in, mode, task, prompt],
        outputs=[text_out, md_out, raw_out, img_out, gallery],
    )
275
+
276
if __name__ == "__main__":
    # Enable the request queue (max_size=20) and then start the app.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()