CMP_AI_RAG / enhanced_pdf_processor.py
AhmedEwis's picture
Upload 17 files
7ce3a9e verified
#!/usr/bin/env python3
"""
Enhanced PDF processor with OCR support for image-based PDFs.
Handles both text-based and image-based (scanned) PDFs.
"""
import asyncio
import logging
import os
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
def test_ocr_availability():
    """Probe which OCR backends can be used in this environment.

    Returns:
        A dict keyed by 'pytesseract', 'easyocr' and 'paddleocr'. A value is
        True when the backend imported successfully (and, for pytesseract,
        when Pillow imported and Tesseract reported a version).

    A status line is printed for every backend that is probed.
    """
    status = dict.fromkeys(('pytesseract', 'easyocr', 'paddleocr'), False)

    # pytesseract is only a wrapper: Pillow and a working Tesseract
    # installation are required as well, so all three are checked together.
    try:
        import pytesseract
        from PIL import Image  # noqa: F401 - imported only to prove it is installed
        tesseract_version = pytesseract.get_tesseract_version()
        status['pytesseract'] = True
        print(f" Pytesseract available - Tesseract version: {tesseract_version}")
    except Exception as exc:
        print(f" Pytesseract not available: {exc}")

    # The remaining backends only need to be importable.
    for module_name, label in (('easyocr', 'EasyOCR'), ('paddleocr', 'PaddleOCR')):
        try:
            __import__(module_name)
            status[module_name] = True
            print(f" {label} available")
        except Exception as exc:
            print(f" {label} not available: {exc}")

    return status
def _ocr_image_with_fallback(pytesseract, image, page_number: int) -> str:
    """Run Tesseract on *image*: Arabic+English first, then English only.

    Returns the recognised text, or "" when both attempts raise.
    """
    tesseract_config = '--oem 3 --psm 6'
    try:
        text = pytesseract.image_to_string(
            image,
            lang='ara+eng',
            config=tesseract_config,
        )
        print(f" OCR extracted {len(text)} characters from page {page_number}")
        return text
    except Exception as ocr_error:
        print(f" OCR with Arabic failed: {ocr_error}")

    # The Arabic attempt failed; retry with English only.
    try:
        text = pytesseract.image_to_string(
            image,
            lang='eng',
            config=tesseract_config,
        )
        print(f" OCR (English only) extracted {len(text)} characters from page {page_number}")
        return text
    except Exception as eng_error:
        print(f" OCR completely failed: {eng_error}")
        return ""


def extract_with_ocr_pytesseract(file_path: Path) -> List[Dict[str, Any]]:
    """Extract text per page with PyMuPDF, using pytesseract OCR where needed.

    For each page the embedded text layer is read first. Only when that is
    empty is the page rendered at 2x zoom and passed to Tesseract.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        One dict per page with 'content', 'page_number' (1-based),
        'extraction_method' and 'metadata'. ``metadata['used_ocr']`` is True
        exactly when OCR was attempted for that page.

    Raises:
        Exception: wraps any failure (missing library, unreadable file, ...)
            with the original error attached as ``__cause__``.
    """
    try:
        import io

        import fitz
        import pytesseract
        from PIL import Image

        print(" Using PyMuPDF + pytesseract OCR")
        pages = []
        doc = fitz.open(str(file_path))
        try:
            for page_index in range(len(doc)):
                page = doc.load_page(page_index)
                page_number = page_index + 1

                text = page.get_text()
                # OCR is needed only for pages without a usable text layer.
                used_ocr = not text.strip()
                if used_ocr:
                    # 2x zoom for better OCR
                    pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                    image = Image.open(io.BytesIO(pix.tobytes("png")))
                    text = _ocr_image_with_fallback(pytesseract, image, page_number)

                pages.append({
                    'content': text,
                    'page_number': page_number,
                    'extraction_method': 'pymupdf_ocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'used_ocr': used_ocr,
                    },
                })
        finally:
            # Close the document even when a page fails mid-way.
            doc.close()
        return pages
    except Exception as e:
        raise Exception(f"PyMuPDF + OCR extraction failed: {str(e)}") from e
def extract_with_ocr_easyocr(file_path: Path) -> List[Dict[str, Any]]:
    """Extract text from every page of a PDF using EasyOCR (Arabic + English).

    Each page is rendered at 3x zoom, converted to a numpy array and passed to
    an EasyOCR reader running on the CPU. Detections with a confidence of 0.3
    or lower are dropped from the page text.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        One dict per page with 'content', 'page_number' (1-based),
        'extraction_method' and 'metadata'. The metadata holds the number of
        raw detections and their average confidence (0 when there were none).

    Raises:
        Exception: wraps any failure (missing library, unreadable file, ...)
            with the original error attached as ``__cause__``.
    """
    try:
        import io

        import easyocr
        import fitz
        import numpy as np
        from PIL import Image

        print(" Using EasyOCR")
        # Initialize EasyOCR reader with Arabic and English, on the CPU.
        reader = easyocr.Reader(['ar', 'en'], gpu=False)

        pages = []
        doc = fitz.open(str(file_path))
        try:
            for page_index in range(len(doc)):
                page = doc.load_page(page_index)
                page_number = page_index + 1

                # 3x zoom for better OCR
                pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                results = reader.readtext(np.array(image))

                # Keep only detections above the confidence threshold.
                text = ' '.join(
                    fragment
                    for _bbox, fragment, confidence in results
                    if confidence > 0.3
                )
                print(f" EasyOCR extracted {len(text)} characters from page {page_number}")

                # The average covers all detections, including filtered ones.
                if results:
                    average_confidence = sum(conf for _, _, conf in results) / len(results)
                else:
                    average_confidence = 0

                pages.append({
                    'content': text,
                    'page_number': page_number,
                    'extraction_method': 'easyocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'ocr_results': len(results),
                        'average_confidence': average_confidence,
                    },
                })
        finally:
            # Close the document even when a page fails mid-way.
            doc.close()
        return pages
    except Exception as e:
        raise Exception(f"EasyOCR extraction failed: {str(e)}") from e
def create_ocr_fix_script(output_path: str = "install_ocr.sh"):
    """Write a bash helper script that installs the OCR libraries.

    Args:
        output_path: Where to write the script. Defaults to "install_ocr.sh"
            in the current working directory.

    The file is written as UTF-8 with LF line endings (CRLF would break the
    shebang when the file is produced on Windows) and is marked executable on
    a best-effort basis so that ``./install_ocr.sh`` works.
    """
    ocr_install_script = """#!/bin/bash
# OCR Libraries Installation Script
echo "Installing OCR libraries for scanned PDF processing..."
# Install Tesseract (Windows using conda/pip)
echo "1. Installing Tesseract OCR..."
# For Windows with conda:
# conda install -c conda-forge tesseract
# For Windows manually: Download from https://github.com/UB-Mannheim/tesseract/wiki
# Install Python OCR libraries
echo "2. Installing Python OCR libraries..."
pip install pytesseract easyocr
# Arabic language data for Tesseract
echo "3. Installing Arabic language support..."
# Tesseract Arabic data should be downloaded automatically
# Manual download: https://github.com/tesseract-ocr/tessdata
echo "OCR installation complete!"
echo ""
echo "To test OCR availability, run:"
echo "python enhanced_pdf_processor.py --test-ocr"
"""
    # newline="\n" disables platform newline translation.
    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(ocr_install_script)

    # Add the execute bits; failing to do so must not abort the caller.
    try:
        os.chmod(output_path, os.stat(output_path).st_mode | 0o111)
    except OSError as chmod_error:
        print(f"Could not mark {output_path} as executable: {chmod_error}")

    print(f"Created {output_path} script for OCR library installation")
def extract_text_with_fallback(file_path: Path) -> Tuple[List[Dict[str, Any]], str]:
    """Extract text from a PDF, falling back to OCR for image-based files.

    Strategy:
      1. Open the PDF with PyMuPDF. If any of the first three pages has a
         text layer, return the text layer of every page.
      2. Otherwise probe the installed OCR backends and try EasyOCR, then
         pytesseract. A backend's result is accepted only when it yields more
         than 50 characters in total.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        ``(pages, method_used)`` where ``pages`` is a list of per-page dicts
        and ``method_used`` is 'pymupdf_regular', 'easyocr' or
        'pytesseract_ocr'.

    Raises:
        Exception: when no OCR backend is installed (an install script is
            written first), or when every available method failed.
    """
    min_ocr_chars = 50  # Minimum reasonable content

    # Method 1: Try regular extraction first
    try:
        import fitz

        doc = fitz.open(str(file_path))
        try:
            # Check only the first 3 pages for an extractable text layer.
            has_text = any(
                doc.load_page(page_index).get_text().strip()
                for page_index in range(min(3, len(doc)))
            )
            if has_text:
                print(" Document has extractable text, using regular extraction")
                pages = []
                for page_index in range(len(doc)):
                    page = doc.load_page(page_index)
                    pages.append({
                        'content': page.get_text(),
                        'page_number': page_index + 1,
                        'extraction_method': 'pymupdf_regular',
                        'metadata': {'page_size': page.rect, 'rotation': page.rotation},
                    })
                return pages, 'pymupdf_regular'
        finally:
            # Runs on normal return and on error, so the file never leaks.
            doc.close()
    except Exception as e:
        print(f" Regular extraction failed: {e}")

    # Method 2: OCR methods for image-based PDFs
    print(" Document appears to be image-based, trying OCR methods...")

    # Probe OCR backends only now: importing them is costly and pointless for
    # PDFs that already have a text layer.
    ocr_available = test_ocr_availability()

    # Ordered by preference: EasyOCR first (best for Arabic), then pytesseract.
    ocr_backends = (
        ('easyocr', extract_with_ocr_easyocr, 'easyocr', 'EasyOCR'),
        ('pytesseract', extract_with_ocr_pytesseract, 'pytesseract_ocr', 'Pytesseract OCR'),
    )
    for availability_key, extractor, method_name, label in ocr_backends:
        if not ocr_available[availability_key]:
            continue
        try:
            pages = extractor(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > min_ocr_chars:
                return pages, method_name
            # Make the fall-through visible instead of silently discarding.
            print(f" {label} produced only {total_chars} characters, trying next method")
        except Exception as e:
            print(f" {label} failed: {e}")

    # If no OCR available, provide instructions
    if not any(ocr_available.values()):
        print("\n ERROR: No OCR libraries available!")
        print(" This PDF contains only images and requires OCR processing.")
        print(" To enable OCR support, install one of the following:")
        print(" 1. pip install pytesseract + Install Tesseract OCR")
        print(" 2. pip install easyocr (recommended for Arabic)")
        print(" 3. Run: ./install_ocr.sh (installation script)")
        create_ocr_fix_script()
        raise Exception("OCR libraries required for image-based PDF")

    raise Exception("All extraction and OCR methods failed")
def main():
    """Command-line entry point.

    Usage:
        python enhanced_pdf_processor.py <pdf_file>
        python enhanced_pdf_processor.py --test-ocr

    With ``--test-ocr`` the OCR backend availability is reported and the
    function returns. Otherwise the given PDF is processed and a summary with
    a preview of the first two pages is printed.

    Exits with status 1 on a usage error, a missing input file or an
    extraction failure, so that shells and scripts can detect the failure.
    """
    if len(sys.argv) < 2:
        print("Usage:")
        print(" python enhanced_pdf_processor.py <pdf_file>")
        print(" python enhanced_pdf_processor.py --test-ocr")
        print("Example: python enhanced_pdf_processor.py 'path/to/scanned.pdf'")
        sys.exit(1)

    if sys.argv[1] == '--test-ocr':
        print("Testing OCR library availability:")
        print("=" * 40)
        ocr_available = test_ocr_availability()
        if any(ocr_available.values()):
            print("\nOCR Status: READY")
            for lib, available in ocr_available.items():
                status = "Available" if available else "Not Available"
                print(f" {lib}: {status}")
        else:
            print("\nOCR Status: NOT READY")
            print("Run install_ocr.sh to install OCR libraries")
        return

    pdf_file = sys.argv[1]
    print("Enhanced PDF Processor with OCR Support")
    print(f"Target file: {pdf_file}")
    print("=" * 60)

    file_path = Path(pdf_file)
    if not file_path.exists():
        print(f"ERROR: File not found: {pdf_file}")
        sys.exit(1)

    # Extract text with intelligent fallback
    try:
        pages, method_used = extract_text_with_fallback(file_path)
    except Exception as e:
        print(f"\nERROR: {e}")
        print("\nFull error traceback:")
        traceback.print_exc()
        sys.exit(1)

    # Results
    total_chars = sum(len(page['content']) for page in pages)
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Method used: {method_used}")
    print(f"Total pages: {len(pages)}")
    print(f"Total characters: {total_chars}")

    if total_chars > 0:
        print("\nSUCCESS: Text extracted successfully!")
        # Show sample content from the first 2 pages
        for page in pages[:2]:
            content = page['content'].strip()
            if content:
                print(f"\nPage {page['page_number']} (first 200 chars):")
                print(f" {content[:200]}...")
    else:
        print("\nWARNING: No text could be extracted")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()