ocr / ocr_engines.py
Beijuka's picture
Upload folder using huggingface_hub
0f922c9 verified
"""OCR engine initializers and runners with safer Tesseract handling."""
import os
import sys
import tempfile
import numpy as np
try:
import easyocr
except Exception:
easyocr = None
try:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
except Exception:
DocumentFile = None
ocr_predictor = None
try:
from paddleocr import PaddleOCR
except Exception:
PaddleOCR = None
try:
import pytesseract
except Exception:
pytesseract = None
try:
import cv2
except Exception:
cv2 = None
def initialize_ocr_models(ocr_models, language_code, device):
    """Create reader objects for the requested OCR engines.

    An engine is only initialised when its name is in ``ocr_models`` and
    its optional import at module level succeeded (i.e. is not ``None``).

    Args:
        ocr_models: Container of engine names; the names checked are
            "EasyOCR", "DocTR", "PaddleOCR" and "Tesseract".
        language_code: Language code passed to ``easyocr.Reader`` (as a
            one-element list) and to ``PaddleOCR(lang=...)``.
        device: Device label; GPU use is requested from EasyOCR and
            PaddleOCR only when this equals "GPU (CUDA)".

    Returns:
        dict mapping engine name to its reader object. Tesseract never
        gets an entry: selecting it only configures
        ``pytesseract.pytesseract.tesseract_cmd``.
    """
    use_gpu = device == "GPU (CUDA)"
    ocr_readers = {}

    if "EasyOCR" in ocr_models and easyocr is not None:
        ocr_readers["EasyOCR"] = easyocr.Reader([language_code], gpu=use_gpu)

    if "DocTR" in ocr_models and ocr_predictor is not None:
        ocr_readers["DocTR"] = ocr_predictor(pretrained=True)

    if "PaddleOCR" in ocr_models and PaddleOCR is not None:
        ocr_readers["PaddleOCR"] = PaddleOCR(lang=language_code, use_gpu=use_gpu)

    if "Tesseract" in ocr_models and pytesseract is not None:
        if sys.platform.startswith("win"):
            # Common Windows installation path.
            candidates = (r"C:\Program Files\Tesseract-OCR\tesseract.exe",)
        else:
            candidates = ("/usr/bin/tesseract", "/usr/local/bin/tesseract")
        # Only override the command when the binary is really there.
        # Otherwise keep pytesseract's current setting so a tesseract that
        # is reachable through PATH keeps working on every platform.
        for candidate in candidates:
            if os.path.exists(candidate):
                pytesseract.pytesseract.tesseract_cmd = candidate
                break

    return ocr_readers
def perform_ocr(model_name, ocr_readers, image, language_code):
    """Run one OCR engine on ``image`` and return the recognised text.

    Args:
        model_name: "EasyOCR", "DocTR", "PaddleOCR" or "Tesseract". Any
            other value yields an empty string.
        ocr_readers: Mapping of engine name to an initialised reader; it
            is queried with ``.get`` so missing engines are tolerated.
        image: Image to read. The DocTR path calls ``image.save(...)``
            and the Tesseract path reads ``image.mode``, so a PIL image is
            expected; the other paths only pass it to ``np.array``.
        language_code: Not used by any engine here; kept so existing
            callers do not break.

    Returns:
        The extracted text, or a bracketed placeholder such as
        "[EasyOCR not available]" when the engine cannot be used, or
        "[Tesseract error: ...]" when pytesseract raises.
    """
    if model_name == "EasyOCR":
        reader = ocr_readers.get("EasyOCR")
        if reader is None:
            return "[EasyOCR not available]"
        return _run_easyocr(reader, image)

    if model_name == "DocTR":
        predictor = ocr_readers.get("DocTR")
        if predictor is None or DocumentFile is None:
            return "[DocTR not available]"
        return _run_doctr(predictor, image)

    if model_name == "PaddleOCR":
        reader = ocr_readers.get("PaddleOCR")
        if reader is None:
            return "[PaddleOCR not available]"
        return _run_paddleocr(reader, image)

    if model_name == "Tesseract":
        if pytesseract is None:
            return "[pytesseract not available]"
        return _run_tesseract(image)

    return ""


def _run_easyocr(reader, image):
    """Return the EasyOCR detections for ``image`` joined one per line."""
    detections = reader.readtext(np.array(image))
    # The recognised string sits at index 1 of every detection.
    return "\n".join(detection[1] for detection in detections)


def _run_doctr(predictor, image):
    """Run a DocTR predictor on ``image`` via a temporary PNG file."""
    # delete=False because the file is re-opened by path after the handle
    # is closed; it is removed explicitly in the ``finally`` below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    file_path = tmp_file.name
    try:
        with tmp_file:
            image.save(tmp_file, format="PNG")
        result = predictor(DocumentFile.from_images(file_path))
        pages = []
        for page in result.pages:
            # Words are space-joined into lines, lines newline-joined into
            # blocks, and blocks / pages are separated by a blank line.
            page_blocks = [
                "\n".join(
                    " ".join(word.value for word in line.words)
                    for line in block.lines
                )
                for block in page.blocks
            ]
            pages.append("\n\n".join(page_blocks))
        return "\n\n".join(pages)
    finally:
        # Runs on failure too, so a raising predictor no longer leaks the
        # temporary file. Removal itself stays best-effort.
        try:
            os.unlink(file_path)
        except OSError:
            pass


def _run_paddleocr(reader, image):
    """Return the text PaddleOCR finds in ``image``, one entry per line."""
    result = reader.ocr(np.array(image))
    try:
        # Usual shape: result[0] is a list of entries whose second item
        # starts with the recognised text.
        return "\n".join(line[1][0] for line in result[0])
    except (TypeError, IndexError, KeyError):
        # Empty or differently structured result (``None``, ``[None]``,
        # ``[]``, ...): collect whatever text tokens can be found instead
        # of raising.
        tokens = []
        for page in result or []:
            for line in page or []:
                if not isinstance(line, (list, tuple)) or len(line) < 2:
                    continue
                candidate = line[1]
                if isinstance(candidate, (list, tuple)) and candidate:
                    tokens.append(candidate[0])
        return "\n".join(tokens)


def _run_tesseract(image):
    """Return the pytesseract text for ``image`` or an error placeholder."""
    try:
        if image.mode != "RGB":
            image = image.convert("RGB")
    except (AttributeError, ValueError, OSError):
        # Not a PIL image, or the conversion failed: OCR the input as-is.
        pass
    # pytesseract assumes RGB channel order for NumPy arrays, so the array
    # is handed over without an RGB->BGR swap.
    pixels = np.array(image)
    # NOTE(review): a "--oem 3 --psm 6 -l <language_code>" config used to
    # be built here but was never passed on; confirm the language codes
    # are valid Tesseract names before wiring it in.
    try:
        return pytesseract.image_to_string(pixels)
    except Exception as e:  # surfaced to the caller as text
        return f"[Tesseract error: {e}]"