Spaces:

Beijuka
/

ocr

Configuration error

App Files Files Community

ocr / ocr_engines.py

Beijuka

Upload folder using huggingface_hub

0f922c9 verified about 2 months ago

raw

history blame contribute delete

4.54 kB

	"""OCR engine initializers and runners with safer Tesseract handling."""

	import os
	import sys
	import tempfile
	import numpy as np

	try:
	import easyocr
	except Exception:
	easyocr = None

	try:
	from doctr.io import DocumentFile
	from doctr.models import ocr_predictor
	except Exception:
	DocumentFile = None
	ocr_predictor = None

	try:
	from paddleocr import PaddleOCR
	except Exception:
	PaddleOCR = None

	try:
	import pytesseract
	except Exception:
	pytesseract = None

	try:
	import cv2
	except Exception:
	cv2 = None


	def initialize_ocr_models(ocr_models, language_code, device):
	    """Create reader objects for the requested OCR engines.

	    Engines whose Python package failed to import (the module-level name is
	    ``None``) are skipped silently.

	    Args:
	        ocr_models: Container of engine names; recognised values are
	            "EasyOCR", "DocTR", "PaddleOCR" and "Tesseract".
	        language_code: Language code handed to EasyOCR and PaddleOCR.
	        device: Device label; the exact string "GPU (CUDA)" enables GPU use
	            for EasyOCR and PaddleOCR, anything else selects CPU.

	    Returns:
	        dict mapping engine name to its initialised reader. Tesseract never
	        gets an entry: it is only configured through
	        ``pytesseract.pytesseract.tesseract_cmd``.
	    """
	    use_gpu = device == "GPU (CUDA)"
	    ocr_readers = {}

	    if "EasyOCR" in ocr_models and easyocr is not None:
	        ocr_readers["EasyOCR"] = easyocr.Reader([language_code], gpu=use_gpu)

	    if "DocTR" in ocr_models and ocr_predictor is not None:
	        ocr_readers["DocTR"] = ocr_predictor(pretrained=True)

	    if "PaddleOCR" in ocr_models and PaddleOCR is not None:
	        ocr_readers["PaddleOCR"] = PaddleOCR(lang=language_code, use_gpu=use_gpu)

	    if "Tesseract" in ocr_models and pytesseract is not None:
	        if sys.platform.startswith("win"):
	            # Common Windows installation path.
	            candidates = (r"C:\Program Files\Tesseract-OCR\tesseract.exe",)
	        else:
	            # Common Unix installation paths.
	            candidates = ("/usr/bin/tesseract", "/usr/local/bin/tesseract")
	        # Only override the command when a candidate really exists; otherwise
	        # keep pytesseract's current setting so a binary on PATH still works.
	        for candidate in candidates:
	            if os.path.isfile(candidate):
	                pytesseract.pytesseract.tesseract_cmd = candidate
	                break

	    return ocr_readers


	def perform_ocr(model_name, ocr_readers, image, language_code):
	    """Run the OCR engine named by *model_name* on *image* and return its text.

	    Args:
	        model_name: One of "EasyOCR", "DocTR", "PaddleOCR" or "Tesseract".
	        ocr_readers: Mapping of engine name to reader, as returned by
	            ``initialize_ocr_models``.
	        image: Image to recognise. It is converted with ``np.array`` for
	            EasyOCR, PaddleOCR and Tesseract, and must provide
	            ``save(file, format=...)`` for DocTR.
	        language_code: Accepted for interface compatibility; not used here.
	            EasyOCR and PaddleOCR receive their language at initialisation
	            and Tesseract is invoked without a language option.

	    Returns:
	        The recognised text. A bracketed placeholder such as
	        "[EasyOCR not available]" is returned when the engine is missing,
	        "[Tesseract error: ...]" when Tesseract raises, and "" for an
	        unrecognised *model_name*.
	    """
	    if model_name == "EasyOCR":
	        reader = ocr_readers.get("EasyOCR")
	        if reader is None:
	            return "[EasyOCR not available]"
	        # Each EasyOCR result item carries the recognised text at index 1.
	        return "\n".join(res[1] for res in reader.readtext(np.array(image)))

	    if model_name == "DocTR":
	        predictor = ocr_readers.get("DocTR")
	        if predictor is None or DocumentFile is None:
	            return "[DocTR not available]"
	        return _run_doctr(predictor, image)

	    if model_name == "PaddleOCR":
	        reader = ocr_readers.get("PaddleOCR")
	        if reader is None:
	            return "[PaddleOCR not available]"
	        return _paddle_result_to_text(reader.ocr(np.array(image)))

	    if model_name == "Tesseract":
	        if pytesseract is None:
	            return "[pytesseract not available]"
	        return _run_tesseract(image)

	    return ""


	def _run_doctr(predictor, image):
	    """Run DocTR on *image* through a temporary PNG and return the page text."""
	    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
	    file_path = tmp_file.name
	    try:
	        # Close the handle before DocTR opens the file by path.
	        with tmp_file:
	            image.save(tmp_file, format="PNG")
	        result = predictor(DocumentFile.from_images(file_path))
	        pages = []
	        for page in result.pages:
	            blocks = [
	                "\n".join(
	                    " ".join(word.value for word in line.words)
	                    for line in block.lines
	                )
	                for block in page.blocks
	            ]
	            pages.append("\n\n".join(blocks))
	        return "\n\n".join(pages)
	    finally:
	        # Always remove the temp file, even when saving or prediction fails.
	        try:
	            os.unlink(file_path)
	        except OSError:
	            pass


	def _paddle_result_to_text(result):
	    """Join the recognised strings of a PaddleOCR result, one per line."""
	    tokens = []
	    # The result, or a page inside it, can be None/empty when nothing was
	    # detected; treat that as "no text" instead of raising.
	    for page in result or []:
	        for line in page or []:
	            if not isinstance(line, (list, tuple)) or len(line) < 2:
	                continue
	            recognised = line[1]
	            if (
	                isinstance(recognised, (list, tuple))
	                and recognised
	                and isinstance(recognised[0], str)
	            ):
	                tokens.append(recognised[0])
	    return "\n".join(tokens)


	def _run_tesseract(image):
	    """Run Tesseract on *image*; return its text or a bracketed error string."""
	    mode = getattr(image, "mode", None)
	    if mode is not None and mode != "RGB":
	        try:
	            image = image.convert("RGB")
	        except Exception:
	            # Best effort: fall through with the image as it was given.
	            pass
	    # pytesseract treats numpy arrays as RGB, so the array is passed as-is
	    # rather than being converted to OpenCV's BGR channel order.
	    try:
	        return pytesseract.image_to_string(np.array(image))
	    except Exception as e:
	        return f"[Tesseract error: {e}]"