Maheen001 committed on
Commit
3fe84f5
·
verified ·
1 Parent(s): 052a7b4

Create tools/ocr_server.py

Browse files
Files changed (1) hide show
  1. tools/ocr_server.py +79 -0
tools/ocr_server.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from pathlib import Path
3
+ from PIL import Image
4
+ import pytesseract
5
+ import easyocr
6
+ import io
7
+
8
+
9
+ async def extract_text_ocr(file_path: str, language: str = 'en') -> dict:
10
+ """
11
+ Extract text from image using OCR
12
+
13
+ Args:
14
+ file_path: Path to image file
15
+ language: Language code for OCR
16
+
17
+ Returns:
18
+ Dict with extracted text and confidence
19
+ """
20
+ try:
21
+ # Try EasyOCR first (better quality)
22
+ reader = easyocr.Reader([language], gpu=False)
23
+ result = reader.readtext(file_path)
24
+
25
+ text_lines = [detection[1] for detection in result]
26
+ confidences = [detection[2] for detection in result]
27
+
28
+ full_text = '\n'.join(text_lines)
29
+ avg_confidence = sum(confidences) / len(confidences) if confidences else 0
30
+
31
+ return {
32
+ 'text': full_text,
33
+ 'confidence': avg_confidence,
34
+ 'line_count': len(text_lines),
35
+ 'method': 'easyocr'
36
+ }
37
+
38
+ except Exception as e:
39
+ # Fallback to Tesseract
40
+ try:
41
+ img = Image.open(file_path)
42
+ text = pytesseract.image_to_string(img, lang=language)
43
+
44
+ return {
45
+ 'text': text,
46
+ 'confidence': 0.8, # Estimated
47
+ 'line_count': len(text.split('\n')),
48
+ 'method': 'tesseract'
49
+ }
50
+ except Exception as e2:
51
+ return {
52
+ 'text': '',
53
+ 'error': f"OCR failed: {str(e)}, {str(e2)}",
54
+ 'confidence': 0
55
+ }
56
+
57
+
58
async def extract_text_from_pdf_image(pdf_path: str, page_num: int = 0, language: str = 'en') -> dict:
    """
    Extract text from one page of a scanned PDF using OCR.

    The page is rasterised with pdf2image, written to a private temporary
    directory, and passed to ``extract_text_ocr``.  The temporary directory
    is removed afterwards, so concurrent calls never share or leak files.

    Args:
        pdf_path: Path to the PDF file
        page_num: Zero-based index of the page to read
        language: Language code forwarded to ``extract_text_ocr``

    Returns:
        The dict returned by ``extract_text_ocr`` on success, otherwise
        ``{'text': '', 'error': <message>}``.  Errors are reported in the
        dict rather than raised.
    """
    try:
        if page_num < 0:
            return {'text': '', 'error': 'page_num must be >= 0'}

        import os
        import tempfile

        from pdf2image import convert_from_path

        # pdf2image uses 1-based page numbers.  Rasterising is blocking
        # work, so it runs in the default executor off the event loop.
        loop = asyncio.get_running_loop()
        images = await loop.run_in_executor(
            None,
            lambda: convert_from_path(
                pdf_path, first_page=page_num + 1, last_page=page_num + 1
            ),
        )

        if not images:
            return {'text': '', 'error': 'No pages found'}

        # A per-call temporary directory avoids the fixed /tmp path, which
        # collided between concurrent requests and was never cleaned up.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, f"page_{page_num}.png")
            images[0].save(temp_path, 'PNG')

            # Extract text
            return await extract_text_ocr(temp_path, language)

    except Exception as e:
        return {'text': '', 'error': str(e)}
79
+