#!/usr/bin/env python3
"""
Enhanced PDF processor with OCR support for image-based PDFs.
Handles both text-based and image-based (scanned) PDFs.
"""

import sys
import os
import traceback
from pathlib import Path
from typing import Any, List, Dict, Optional, Union, Tuple
import asyncio
import logging


def test_ocr_availability():
    """Probe which OCR backends can be used in the current environment.

    Each backend is imported inside its own ``try`` block so that a missing
    or broken installation of one library does not hide the others. For
    pytesseract the Tesseract binary is queried as well, because the Python
    wrapper can be importable while the executable is absent.

    Returns:
        dict mapping ``'pytesseract'``, ``'easyocr'`` and ``'paddleocr'`` to
        ``True`` when the probe succeeded and ``False`` otherwise. A status
        line is printed for every backend.
    """
    ocr_available = {
        'pytesseract': False,
        'easyocr': False,
        'paddleocr': False
    }

    # Test pytesseract + tesseract
    try:
        import pytesseract
        from PIL import Image  # noqa: F401 - imported only to confirm Pillow is present

        # Try to get tesseract version to verify installation
        version = pytesseract.get_tesseract_version()
        ocr_available['pytesseract'] = True
        print(f" Pytesseract available - Tesseract version: {version}")
    except Exception as e:  # broad on purpose: import errors and missing binary alike
        print(f" Pytesseract not available: {e}")

    # Test EasyOCR
    try:
        import easyocr  # noqa: F401
        ocr_available['easyocr'] = True
        print(" EasyOCR available")
    except Exception as e:
        print(f" EasyOCR not available: {e}")

    # Test PaddleOCR
    try:
        import paddleocr  # noqa: F401
        ocr_available['paddleocr'] = True
        print(" PaddleOCR available")
    except Exception as e:
        print(f" PaddleOCR not available: {e}")

    return ocr_available


def _render_page_as_image(page, zoom: float):
    """Rasterize a PyMuPDF page at ``zoom``x scale and return it as a PIL image."""
    import io

    import fitz
    from PIL import Image

    pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    return Image.open(io.BytesIO(pixmap.tobytes("png")))


def extract_with_ocr_pytesseract(file_path: Path) -> List[Dict[str, Any]]:
    """Extract text using PyMuPDF for images + pytesseract for OCR.

    For every page the embedded text layer is read first. Only when that
    layer is empty is the page rendered at 2x zoom and passed to Tesseract,
    first with Arabic + English and, if that call raises, with English only.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        One dict per page with the keys ``content``, ``page_number``
        (1-based), ``extraction_method`` (always ``'pymupdf_ocr'``) and
        ``metadata``. ``metadata['used_ocr']`` is ``True`` only when an OCR
        call actually ran successfully for that page.

    Raises:
        Exception: wraps any failure (missing library, unreadable file, ...)
            with the original error attached as ``__cause__``.
    """
    try:
        import fitz
        import pytesseract

        print(" Using PyMuPDF + pytesseract OCR")

        doc = fitz.open(str(file_path))
        try:
            pages = []
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # First try regular text extraction
                text = page.get_text()
                ocr_performed = False

                # If no text found, try OCR on page images
                if not text.strip():
                    # 2x zoom for better OCR
                    image = _render_page_as_image(page, 2.0)

                    # Perform OCR with Arabic support
                    try:
                        # Configure for Arabic + English
                        text = pytesseract.image_to_string(
                            image,
                            lang='ara+eng',
                            config='--oem 3 --psm 6'
                        )
                        ocr_performed = True
                        print(f" OCR extracted {len(text)} characters from page {page_num + 1}")
                    except Exception as ocr_error:
                        print(f" OCR with Arabic failed: {ocr_error}")
                        # Try with just English
                        try:
                            text = pytesseract.image_to_string(
                                image,
                                lang='eng',
                                config='--oem 3 --psm 6'
                            )
                            ocr_performed = True
                            print(f" OCR (English only) extracted {len(text)} characters from page {page_num + 1}")
                        except Exception as eng_error:
                            print(f" OCR completely failed: {eng_error}")
                            text = ""

                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'pymupdf_ocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        # Pages served from the text layer must not be
                        # reported as OCR output.
                        'used_ocr': ocr_performed
                    }
                })
            return pages
        finally:
            # Release the file handle even when a page fails mid-way.
            doc.close()

    except Exception as e:
        raise Exception(f"PyMuPDF + OCR extraction failed: {str(e)}") from e


def extract_with_ocr_easyocr(file_path: Path) -> List[Dict[str, Any]]:
    """Extract text using EasyOCR (good for Arabic).

    Every page is rendered at 3x zoom and recognised with a CPU-only EasyOCR
    reader configured for Arabic and English. Detections with a confidence
    of 0.3 or lower are left out of the page text.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        One dict per page with the keys ``content``, ``page_number``
        (1-based), ``extraction_method`` (``'easyocr'``) and ``metadata``.
        ``metadata['ocr_results']`` counts all detections and
        ``metadata['average_confidence']`` averages over all of them,
        including those filtered out of ``content`` (0 when there are none).

    Raises:
        Exception: wraps any failure with the original error attached as
            ``__cause__``.
    """
    try:
        import fitz
        import easyocr
        import numpy as np

        print(" Using EasyOCR")

        # Initialize EasyOCR reader with Arabic and English
        reader = easyocr.Reader(['ar', 'en'], gpu=False)  # Use CPU

        doc = fitz.open(str(file_path))
        try:
            pages = []
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # 3x zoom for better OCR; EasyOCR takes a numpy array
                img_array = np.array(_render_page_as_image(page, 3.0))

                # Perform OCR
                results = reader.readtext(img_array)

                # Combine all text, filtering low confidence results
                text = ' '.join(
                    fragment
                    for (_bbox, fragment, confidence) in results
                    if confidence > 0.3
                )
                print(f" EasyOCR extracted {len(text)} characters from page {page_num + 1}")

                if results:
                    average_confidence = sum(conf for _, _, conf in results) / len(results)
                else:
                    average_confidence = 0

                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'easyocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'ocr_results': len(results),
                        'average_confidence': average_confidence
                    }
                })
            return pages
        finally:
            # Release the file handle even when a page fails mid-way.
            doc.close()

    except Exception as e:
        raise Exception(f"EasyOCR extraction failed: {str(e)}") from e


def create_ocr_fix_script():
    """Create a script to install OCR libraries.

    Writes ``install_ocr.sh`` into the current working directory,
    overwriting an existing file of that name, and prints a confirmation.
    The file is written as UTF-8 with LF line endings regardless of
    platform, because a CRLF shebang line stops bash from running it.
    """
    ocr_install_script = """#!/bin/bash
# OCR Libraries Installation Script

echo "Installing OCR libraries for scanned PDF processing..."

# Install Tesseract (Windows using conda/pip)
echo "1. Installing Tesseract OCR..."
# For Windows with conda:
# conda install -c conda-forge tesseract

# For Windows manually: Download from https://github.com/UB-Mannheim/tesseract/wiki

# Install Python OCR libraries
echo "2. Installing Python OCR libraries..."
pip install pytesseract easyocr

# Arabic language data for Tesseract
echo "3. Installing Arabic language support..."
# Tesseract Arabic data should be downloaded automatically
# Manual download: https://github.com/tesseract-ocr/tessdata

echo "OCR installation complete!"
echo ""
echo "To test OCR availability, run:"
echo "python enhanced_pdf_processor.py --test-ocr"
"""

    with open("install_ocr.sh", 'w', encoding="utf-8", newline="\n") as f:
        f.write(ocr_install_script)

    print("Created install_ocr.sh script for OCR library installation")


def extract_text_with_fallback(file_path: Path) -> Tuple[List[Dict[str, Any]], str]:
    """
    Extract text using multiple methods with intelligent fallback.

    Order of attempts:
      1. Plain PyMuPDF text extraction, used when any of the first three
         pages has a non-empty text layer.
      2. EasyOCR, if importable.
      3. pytesseract, if usable.
    An OCR result is accepted only when it yields more than 50 characters in
    total. OCR backends are probed only once step 1 has not returned,
    so text-based PDFs do not pay for importing the OCR libraries.

    Args:
        file_path: Path of the PDF to read.

    Returns (pages, method_used), where ``method_used`` is one of
    ``'pymupdf_regular'``, ``'easyocr'`` or ``'pytesseract_ocr'``.

    Raises:
        Exception: when no OCR backend is available (an ``install_ocr.sh``
            helper script is written first) or when every method failed.
    """
    # Method 1: Try regular extraction first
    try:
        import fitz

        doc = fitz.open(str(file_path))
        try:
            # Check if document has extractable text (first 3 pages only)
            has_text = any(
                doc.load_page(page_num).get_text().strip()
                for page_num in range(min(3, len(doc)))
            )

            if has_text:
                print(" Document has extractable text, using regular extraction")
                pages = []
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    pages.append({
                        'content': page.get_text(),
                        'page_number': page_num + 1,
                        'extraction_method': 'pymupdf_regular',
                        'metadata': {'page_size': page.rect, 'rotation': page.rotation}
                    })
                return pages, 'pymupdf_regular'
        finally:
            doc.close()

    except Exception as e:
        print(f" Regular extraction failed: {e}")

    # Method 2: OCR methods for image-based PDFs
    print(" Document appears to be image-based, trying OCR methods...")

    # Test what OCR libraries are available
    ocr_available = test_ocr_availability()

    # Try EasyOCR (best for Arabic)
    if ocr_available['easyocr']:
        try:
            pages = extract_with_ocr_easyocr(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > 50:  # Minimum reasonable content
                return pages, 'easyocr'
        except Exception as e:
            print(f" EasyOCR failed: {e}")

    # Try pytesseract
    if ocr_available['pytesseract']:
        try:
            pages = extract_with_ocr_pytesseract(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > 50:
                return pages, 'pytesseract_ocr'
        except Exception as e:
            print(f" Pytesseract OCR failed: {e}")

    # If no OCR available, provide instructions
    if not any(ocr_available.values()):
        print("\n ERROR: No OCR libraries available!")
        print(" This PDF contains only images and requires OCR processing.")
        print(" To enable OCR support, install one of the following:")
        print(" 1. pip install pytesseract + Install Tesseract OCR")
        print(" 2. pip install easyocr (recommended for Arabic)")
        print(" 3. Run: ./install_ocr.sh (installation script)")

        create_ocr_fix_script()

        raise Exception("OCR libraries required for image-based PDF")

    raise Exception("All extraction and OCR methods failed")


def main():
    """Command-line entry point.

    ``--test-ocr`` reports which OCR backends are available; any other first
    argument is treated as the path of a PDF to extract, and a summary plus
    a sample of the first two pages is printed. With no arguments the usage
    text is printed and the process exits with status 1.
    """
    if len(sys.argv) < 2:
        print("Usage:")
        # The argument placeholder was missing from this usage line.
        print(" python enhanced_pdf_processor.py <pdf_file>")
        print(" python enhanced_pdf_processor.py --test-ocr")
        print("Example: python enhanced_pdf_processor.py 'path/to/scanned.pdf'")
        sys.exit(1)

    if sys.argv[1] == '--test-ocr':
        print("Testing OCR library availability:")
        print("=" * 40)
        ocr_available = test_ocr_availability()

        if any(ocr_available.values()):
            print("\nOCR Status: READY")
            for lib, available in ocr_available.items():
                status = "Available" if available else "Not Available"
                print(f" {lib}: {status}")
        else:
            print("\nOCR Status: NOT READY")
            print("Run install_ocr.sh to install OCR libraries")
        return

    pdf_file = sys.argv[1]

    print("Enhanced PDF Processor with OCR Support")
    print(f"Target file: {pdf_file}")
    print("=" * 60)

    try:
        file_path = Path(pdf_file)
        if not file_path.exists():
            print(f"ERROR: File not found: {pdf_file}")
            return

        # Extract text with intelligent fallback
        pages, method_used = extract_text_with_fallback(file_path)

        # Results
        total_chars = sum(len(page['content']) for page in pages)

        print("\n" + "=" * 60)
        print("EXTRACTION RESULTS")
        print("=" * 60)
        print(f"Method used: {method_used}")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")

        if total_chars > 0:
            print("\nSUCCESS: Text extracted successfully!")

            # Show sample content
            for page in pages[:2]:  # Show first 2 pages
                content = page['content'].strip()
                if content:
                    print(f"\nPage {page['page_number']} (first 200 chars):")
                    print(f" {content[:200]}...")
        else:
            print("\nWARNING: No text could be extracted")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        print(f"\nERROR: {e}")
        print("\nFull error traceback:")
        traceback.print_exc()


if __name__ == "__main__":
    main()