|
|
"""OCR extraction using pdf2image + pytesseract for scanned pages.""" |
|
|
from pdf2image import convert_from_path |
|
|
import pytesseract |
|
|
from PIL import Image |
|
|
from utils import load_config |
|
|
import os |
|
|
|
|
|
def pdf_to_images(pdf_path, dpi=None, out_dir=None):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF passed to ``convert_from_path``.
        dpi: Rendering resolution. When falsy, the ``ocr.dpi`` entry of the
            loaded config is used, defaulting to 300.
        out_dir: Directory that receives the PNG files; created if missing.
            When falsy, files are written to the current directory.

    Returns:
        List of written paths, named ``page_<n>.png`` with ``n`` starting
        at 1, in page order.
    """
    settings = load_config()
    resolution = dpi if dpi else settings.get('ocr', {}).get('dpi', 300)
    rendered_pages = convert_from_path(pdf_path, dpi=resolution)

    # Only an explicitly requested directory is created; '.' already exists.
    target_dir = out_dir if out_dir else '.'
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    saved = []
    for number, page in enumerate(rendered_pages, start=1):
        destination = os.path.join(target_dir, f'page_{number}.png')
        page.save(destination, 'PNG')
        saved.append(destination)
    return saved
|
|
|
|
|
def ocr_image(path, lang=None):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        path: Path of the image file to open.
        lang: Tesseract language code. When falsy, the ``ocr.lang`` entry of
            the loaded config is used, defaulting to ``'eng'``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    cfg = load_config()
    lang = lang or cfg.get('ocr', {}).get('lang', 'eng')
    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle even if OCR raises.
    with Image.open(path) as img:
        return pytesseract.image_to_string(img, lang=lang)
|
|
|
|
|
def extract_full_text(pdf_path, do_ocr=True):
    """Return the text of a PDF, falling back to OCR when needed.

    The embedded text layer is tried first via
    ``pdf_loader.extract_text_from_pdf``. If it yields more than 200
    characters it is returned as-is. Otherwise, when ``do_ocr`` is true,
    every page is rendered into ``./temp_pages`` and OCR'd.

    Args:
        pdf_path: Path of the PDF to read.
        do_ocr: When false, never run OCR; whatever embedded text was found
            (possibly an empty string) is returned instead.

    Returns:
        A ``(full_text, page_texts)`` tuple. ``page_texts`` holds one OCR
        result per page, and is empty when no OCR was performed.
    """
    import logging

    try:
        from pdf_loader import extract_text_from_pdf

        txt = extract_text_from_pdf(pdf_path) or ''
        if len(txt) > 200:
            return txt, []
    except Exception:
        # Deliberately best-effort: any failure of the text-layer path
        # (missing module, unreadable PDF, ...) falls through to OCR.
        logging.getLogger(__name__).debug(
            'Text-layer extraction failed for %s', pdf_path, exc_info=True
        )
        txt = ''

    # Honour the caller's request to skip OCR (previously ignored).
    if not do_ocr:
        return txt, []

    page_paths = pdf_to_images(pdf_path, out_dir='./temp_pages')
    ocr_texts = [ocr_image(page_path) for page_path in page_paths]
    return '\n\n'.join(ocr_texts), ocr_texts
|
|
|