File size: 4,538 Bytes
0f922c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
"""OCR engine initializers and runners with safer Tesseract handling."""
import os
import sys
import tempfile
import numpy as np
try:
import easyocr
except Exception:
easyocr = None
try:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
except Exception:
DocumentFile = None
ocr_predictor = None
try:
from paddleocr import PaddleOCR
except Exception:
PaddleOCR = None
try:
import pytesseract
except Exception:
pytesseract = None
try:
import cv2
except Exception:
cv2 = None
def initialize_ocr_models(ocr_models, language_code, device):
    """Create a reader for each requested OCR engine whose package imported.

    Args:
        ocr_models: Collection of engine names. The names recognised here
            are "EasyOCR", "DocTR", "PaddleOCR" and "Tesseract".
        language_code: Language identifier handed to EasyOCR and PaddleOCR.
        device: Device label. Only the exact string "GPU (CUDA)" turns on
            GPU use for EasyOCR and PaddleOCR.

    Returns:
        A dict mapping engine name to its initialised reader. Tesseract has
        no reader object and never appears in the dict; requesting it only
        configures the executable path used by pytesseract.
    """
    ocr_readers = {}
    use_gpu = device == "GPU (CUDA)"

    if "EasyOCR" in ocr_models and easyocr is not None:
        ocr_readers["EasyOCR"] = easyocr.Reader([language_code], gpu=use_gpu)

    if "DocTR" in ocr_models and ocr_predictor is not None:
        ocr_readers["DocTR"] = ocr_predictor(pretrained=True)

    if "PaddleOCR" in ocr_models and PaddleOCR is not None:
        ocr_readers["PaddleOCR"] = PaddleOCR(lang=language_code, use_gpu=use_gpu)

    if "Tesseract" in ocr_models and pytesseract is not None:
        if sys.platform.startswith("win"):
            # Common Windows installation path.
            candidates = (r"C:\Program Files\Tesseract-OCR\tesseract.exe",)
        else:
            candidates = ("/usr/bin/tesseract", "/usr/local/bin/tesseract")
        # Override the command only when a known location really exists, on
        # every platform. Otherwise pytesseract keeps its current setting, so
        # a tesseract that is reachable via PATH keeps working instead of
        # being replaced by a path that is not there.
        for candidate in candidates:
            if os.path.exists(candidate):
                pytesseract.pytesseract.tesseract_cmd = candidate
                break

    return ocr_readers
def _ocr_with_easyocr(reader, image):
    """Run EasyOCR on *image* and return the detected strings, one per line."""
    detections = reader.readtext(np.array(image))
    # The text of each detection is taken from its second element.
    return "\n".join(detection[1] for detection in detections)


def _ocr_with_doctr(predictor, image):
    """Run DocTR on *image* via a temporary PNG and return the page text."""
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    file_path = tmp_file.name
    try:
        # Close the handle before the path is read again: a named temporary
        # file cannot be reopened on Windows while it is still open.
        with tmp_file:
            image.save(tmp_file, format="PNG")
        doc = DocumentFile.from_images(file_path)
        result = predictor(doc)
    finally:
        # Always remove the temporary file, including when saving or
        # prediction raised; a failed removal is not worth failing OCR for.
        try:
            os.unlink(file_path)
        except OSError:
            pass

    pages = []
    for page in result.pages:
        blocks = [
            "\n".join(
                " ".join(word.value for word in line.words)
                for line in block.lines
            )
            for block in page.blocks
        ]
        pages.append("\n\n".join(blocks))
    return "\n\n".join(pages)


def _paddle_entry_text(entry):
    """Return the string of a ``[box, (text, score)]`` entry, else ``None``."""
    if isinstance(entry, (list, tuple)) and len(entry) > 1:
        payload = entry[1]
        if (
            isinstance(payload, (list, tuple))
            and payload
            and isinstance(payload[0], str)
        ):
            return payload[0]
    return None


def _ocr_with_paddleocr(reader, image):
    """Run PaddleOCR on *image* and return the detected strings, one per line."""
    result = reader.ocr(np.array(image))
    tokens = []
    for page in result or []:
        # A page can be None/empty (nothing detected); iterating it would
        # raise, so it is skipped and contributes no text.
        if not page:
            continue
        for entry in page:
            entry_text = _paddle_entry_text(entry)
            if entry_text is not None:
                tokens.append(entry_text)
    return "\n".join(tokens)


def _ocr_with_tesseract(image):
    """Run Tesseract on *image*; failures are reported in the returned text."""
    # Best-effort normalisation to RGB; inputs without a ``mode`` attribute
    # are passed through unchanged.
    if getattr(image, "mode", "RGB") != "RGB":
        try:
            image = image.convert("RGB")
        except (AttributeError, ValueError, OSError):
            pass
    try:
        # pytesseract treats array input as RGB, so the array is handed over
        # as-is; converting it to OpenCV's BGR order would swap the channels.
        # NOTE: no language/config is forwarded, matching the previous call,
        # whose "--oem 3 --psm 6 -l <language_code>" argument was disabled.
        return pytesseract.image_to_string(np.array(image))
    except Exception as e:
        return f"[Tesseract error: {e}]"


def perform_ocr(model_name, ocr_readers, image, language_code):
    """Extract text from *image* with the engine named by *model_name*.

    Args:
        model_name: One of "EasyOCR", "DocTR", "PaddleOCR" or "Tesseract".
        ocr_readers: Dict of initialised readers keyed by engine name.
        image: The image to read. The DocTR path calls ``image.save`` and
            the Tesseract path may call ``image.convert``; the other paths
            only convert it with ``np.array``.
        language_code: Accepted for interface compatibility; no engine call
            in this function currently receives it.

    Returns:
        The recognised text. A bracketed message such as
        "[EasyOCR not available]" is returned when the engine is missing,
        "[Tesseract error: ...]" when Tesseract fails, and an empty string
        for an unrecognised *model_name*.
    """
    if model_name == "EasyOCR":
        reader = ocr_readers.get("EasyOCR")
        if reader is None:
            return "[EasyOCR not available]"
        return _ocr_with_easyocr(reader, image)

    if model_name == "DocTR":
        predictor = ocr_readers.get("DocTR")
        if predictor is None or DocumentFile is None:
            return "[DocTR not available]"
        return _ocr_with_doctr(predictor, image)

    if model_name == "PaddleOCR":
        reader = ocr_readers.get("PaddleOCR")
        if reader is None:
            return "[PaddleOCR not available]"
        return _ocr_with_paddleocr(reader, image)

    if model_name == "Tesseract":
        if pytesseract is None:
            return "[pytesseract not available]"
        return _ocr_with_tesseract(image)

    return ""
|