LLDDWW Claude committed on
Commit f16cb1a · 1 Parent(s): 39446f7

feat: switch to PaddleOCR for better Korean text recognition


- Replace TrOCR with PaddleOCR (Korean model)
- Remove LLM model loading (not used)
- Simplify dependencies to only OCR-related packages
- PaddleOCR provides superior Korean text recognition

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2)
  1. app.py +19 -50
  2. requirements.txt +3 -9
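
For context, below is a minimal, standalone sketch of the PaddleOCR call pattern the new `app.py` relies on. It assumes the PaddleOCR 2.x-style API used in the diff (`use_angle_cls`, `use_gpu`, and `ocr(..., cls=True)`); newer PaddleOCR releases rename or drop some of these arguments. The input file name is only illustrative.

```python
# Minimal sketch of the OCR flow this commit adopts (PaddleOCR 2.x-style API).
# "prescription.png" is an illustrative input, not a file from the repository.
import numpy as np
from PIL import Image
from paddleocr import PaddleOCR

# One-time initialization: Korean recognition model plus text-angle classification,
# mirroring the arguments used in the new app.py.
ocr = PaddleOCR(use_angle_cls=True, lang="korean", use_gpu=True)

image = Image.open("prescription.png").convert("RGB")
result = ocr.ocr(np.array(image), cls=True)  # returns one entry per input image

for line in result[0]:  # each line is [box_points, (text, confidence)]
    box, (text, confidence) = line
    print(f"{confidence:.2f}  {text}")
```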
app.py CHANGED
@@ -1,52 +1,17 @@
 import json
 import re
 from typing import List, Optional, Tuple
+import numpy as np
 
 import gradio as gr
 import spaces
-import torch
 from PIL import Image
-from transformers import VisionEncoderDecoderModel, TrOCRProcessor, AutoTokenizer, AutoModelForCausalLM
+from paddleocr import PaddleOCR
 
-# Stage 1: OCR model (extract text from documents with Korean TrOCR)
-OCR_MODEL_ID = "ddobokki/ko-trocr"
-
-# Stage 2: LLM model (extract medication names from text)
-LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
-
-
-def _load_ocr_model():
-    """Load the TrOCR model"""
-    model = VisionEncoderDecoderModel.from_pretrained(
-        OCR_MODEL_ID,
-        device_map="auto",
-    )
-
-    processor = TrOCRProcessor.from_pretrained(OCR_MODEL_ID)
-    return model, processor
-
-
-def _load_llm_model():
-    """Load the Qwen2.5 7B model (8-bit quantization)"""
-    model = AutoModelForCausalLM.from_pretrained(
-        LLM_MODEL_ID,
-        device_map="auto",
-        load_in_8bit=True,
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, trust_remote_code=True)
-    return model, tokenizer
-
-
-print("🔄 Loading Korean TrOCR model (ddobokki/ko-trocr)...")
-OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
-print("✅ Korean TrOCR model loaded!")
-
-print("🔄 Loading Qwen2.5-7B-Instruct...")
-LLM_MODEL, LLM_TOKENIZER = _load_llm_model()
-print("✅ LLM model loaded!")
+# Initialize PaddleOCR (Korean)
+print("🔄 Loading PaddleOCR (Korean)...")
+OCR_MODEL = PaddleOCR(use_angle_cls=True, lang='korean', use_gpu=True)
+print("✅ PaddleOCR loaded!")
 
 
 def _extract_assistant_content(decoded: str) -> str:
@@ -67,17 +32,21 @@ def _extract_json_block(text: str) -> Optional[str]:
 
 
 def extract_text_from_image(image: Image.Image) -> str:
-    """Stage 1: extract text from the image with TrOCR (OCR)"""
+    """Extract text from the image with PaddleOCR"""
     try:
-        # TrOCR processes the whole image in one pass
-        pixel_values = OCR_PROCESSOR(image, return_tensors="pt").pixel_values
-        pixel_values = pixel_values.to(device=OCR_MODEL.device, dtype=OCR_MODEL.dtype)
+        # Convert the PIL Image to a numpy array
+        img_array = np.array(image)
 
-        with torch.no_grad():
-            generated_ids = OCR_MODEL.generate(pixel_values)
+        # Run PaddleOCR
+        result = OCR_MODEL.ocr(img_array, cls=True)
 
-        extracted_text = OCR_PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return extracted_text.strip()
+        # Keep only the recognized text from the results
+        if result and result[0]:
+            texts = [line[1][0] for line in result[0]]
+            extracted_text = "\n".join(texts)
+            return extracted_text.strip()
+        else:
+            return "No text found."
 
     except Exception as e:
         raise Exception(f"OCR error: {str(e)}")
@@ -297,7 +266,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
     ---
 
     **ℹ️ OCR Model**
-    - Korean TrOCR (ddobokki/ko-trocr) - a model optimized for Korean text recognition
+    - PaddleOCR (Korean) - an OCR engine optimized for Korean text recognition
     """)
 
 if __name__ == "__main__":
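
Note on the parsing in `extract_text_from_image`: with the 2.x-style API, `OCR_MODEL.ocr(...)` returns a list with one entry per input image, and each entry is a list of `[box_points, (text, confidence)]` items, which is why `line[1][0]` yields the recognized string. A hypothetical extension (not part of this commit) that also uses the confidence score to drop noisy lines might look like this:

```python
# Hypothetical helper, not part of the commit: assumes the same result layout as
# extract_text_from_image ([[box_points, (text, confidence)], ...] per image),
# but discards lines below a confidence threshold.
from typing import List


def filter_ocr_lines(result, min_confidence: float = 0.5) -> List[str]:
    if not result or not result[0]:
        return []
    texts = []
    for box, (text, confidence) in result[0]:
        if confidence >= min_confidence:
            texts.append(text)
    return texts
```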
requirements.txt CHANGED
@@ -1,11 +1,5 @@
-transformers>=4.46.0
-torch>=2.1.0
-accelerate>=0.25.0
-einops
 gradio>=4.0.0
+paddleocr
+paddlepaddle-gpu
 Pillow
-sentencepiece
-torchvision
-qwen-vl-utils
-bitsandbytes>=0.41.0
-scipy
+numpy
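
`paddlepaddle-gpu` presumes a CUDA-capable runtime; on CPU-only hardware the plain `paddlepaddle` package (together with `use_gpu=False` in `app.py`) would likely be the right substitute. A quick way to check which build is installed before relying on `use_gpu=True`:

```python
# Environment sanity check (a sketch, not part of the commit).
import paddle

print("Paddle version:", paddle.__version__)
print("Built with CUDA:", paddle.is_compiled_with_cuda())

# run_check() performs a small end-to-end test of the PaddlePaddle installation.
paddle.utils.run_check()
```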