# docintel-ai-extractor / ocr_extractor.py
# Uploaded by hmnshudhmn24 ("Upload 14 files", revision 1108401, verified)
"""OCR extraction using pdf2image + pytesseract for scanned pages."""
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
from utils import load_config
import os
def pdf_to_images(pdf_path, dpi=None, out_dir=None):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF handed to ``convert_from_path``.
        dpi: Rendering resolution. When falsy, the ``ocr.dpi`` entry of the
            loaded config is used, falling back to 300.
        out_dir: Directory that receives the PNG files. It is created if it
            does not exist. When falsy, files are written to the current
            working directory.

    Returns:
        A list of the written file paths, in page order. Files are named
        ``page_<n>.png`` with ``n`` starting at 1.
    """
    config = load_config()
    if not dpi:
        dpi = config.get('ocr', {}).get('dpi', 300)

    rendered = convert_from_path(pdf_path, dpi=dpi)

    # Only an explicitly requested directory is created; the fallback is the
    # current directory, which already exists.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
        target_dir = out_dir
    else:
        target_dir = '.'

    saved = []
    for page_no, page_img in enumerate(rendered, start=1):
        png_path = os.path.join(target_dir, f'page_{page_no}.png')
        page_img.save(png_path, 'PNG')
        saved.append(png_path)
    return saved
def ocr_image(path, lang=None):
    """Run Tesseract OCR on one image file and return the recognized text.

    Args:
        path: Path of the image file to open.
        lang: Language value passed to Tesseract. When falsy, the ``ocr.lang``
            entry of the loaded config is used, falling back to ``'eng'``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The config is only consulted when the caller did not choose a language.
    if not lang:
        lang = load_config().get('ocr', {}).get('lang', 'eng')

    # Image.open keeps the file handle open until the image is closed; the
    # context manager releases it once OCR is done, so processing many pages
    # does not accumulate open file descriptors.
    with Image.open(path) as img:
        return pytesseract.image_to_string(img, lang=lang)
def extract_full_text(pdf_path, do_ocr=True):
    """Return the text of a PDF, preferring embedded text over OCR.

    The embedded text layer is tried first via
    ``pdf_loader.extract_text_from_pdf``. If that yields more than 200
    characters it is returned as-is. Otherwise, when ``do_ocr`` is true, the
    pages are rendered to ``./temp_pages`` and OCR'd one by one.

    Args:
        pdf_path: Path of the PDF to read.
        do_ocr: When false, OCR is never run and whatever embedded text was
            found (possibly an empty string) is returned.

    Returns:
        A ``(text, ocr_pages)`` tuple. ``ocr_pages`` is the list of per-page
        OCR strings, or an empty list when OCR was not used. When OCR is used,
        ``text`` is the per-page strings joined by blank lines.
    """
    import logging  # local import keeps this function self-contained

    embedded = ''
    try:
        from pdf_loader import extract_text_from_pdf
        # Normalise a None result to '' so the length check below is safe.
        embedded = extract_text_from_pdf(pdf_path) or ''
    except Exception:
        # Best-effort: any failure in the embedded-text path falls through to
        # OCR, but the reason is recorded instead of being silently dropped.
        logging.getLogger(__name__).debug(
            'Embedded text extraction failed for %s', pdf_path, exc_info=True
        )

    # More than 200 characters is treated as a usable text layer. With OCR
    # disabled the embedded text is the only result available, however short.
    if len(embedded) > 200 or not do_ocr:
        return embedded, []

    # Fall back to OCR: render the pages to image files, then OCR each one.
    page_paths = pdf_to_images(pdf_path, out_dir='./temp_pages')
    ocr_texts = [ocr_image(page_path) for page_path in page_paths]
    return '\n\n'.join(ocr_texts), ocr_texts