Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Enhanced PDF processor with OCR support for image-based PDFs. | |
| Handles both text-based and image-based (scanned) PDFs. | |
| """ | |
import asyncio
import logging
import os
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
def test_ocr_availability():
    """Probe which OCR back ends can be imported in this environment.

    Each back end is checked independently and a one-line status message is
    printed for it. pytesseract only counts as available when Pillow imports
    as well and the Tesseract version query succeeds.

    Returns:
        dict mapping 'pytesseract', 'easyocr' and 'paddleocr' to a bool.
    """
    tesseract_ok = False
    easyocr_ok = False
    paddle_ok = False

    # pytesseract: the version query verifies the Tesseract installation,
    # not just the Python wrapper.
    try:
        import pytesseract
        from PIL import Image
        tesseract_version = pytesseract.get_tesseract_version()
        tesseract_ok = True
        print(f" Pytesseract available - Tesseract version: {tesseract_version}")
    except Exception as err:
        print(f" Pytesseract not available: {err}")

    # EasyOCR: a successful import is treated as "available".
    try:
        import easyocr
        easyocr_ok = True
        print(" EasyOCR available")
    except Exception as err:
        print(f" EasyOCR not available: {err}")

    # PaddleOCR: a successful import is treated as "available".
    try:
        import paddleocr
        paddle_ok = True
        print(" PaddleOCR available")
    except Exception as err:
        print(f" PaddleOCR not available: {err}")

    return {
        'pytesseract': tesseract_ok,
        'easyocr': easyocr_ok,
        'paddleocr': paddle_ok,
    }
def extract_with_ocr_pytesseract(file_path: Path) -> List[Dict[str, Any]]:
    """Extract per-page text with PyMuPDF, falling back to Tesseract OCR.

    For every page the embedded text layer is read first. Only when that is
    empty is the page rendered to an image and passed to pytesseract, first
    with Arabic+English and, if that raises, with English only.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        One dict per page with the keys 'content', 'page_number' (1-based),
        'extraction_method' (always 'pymupdf_ocr') and 'metadata'
        ('page_size', 'rotation', 'used_ocr'). 'used_ocr' is True only for
        pages where the OCR fallback was actually run.

    Raises:
        Exception: Any failure (missing library, unreadable file, ...) is
            wrapped with a "PyMuPDF + OCR extraction failed" message; the
            original error is attached as ``__cause__``.
    """
    try:
        import fitz
        import pytesseract
        from PIL import Image
        import io

        print(" Using PyMuPDF + pytesseract OCR")
        # Tesseract flags carried over unchanged for both language attempts.
        tesseract_config = '--oem 3 --psm 6'
        pages = []
        doc = fitz.open(str(file_path))
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                # First try regular text extraction.
                text = page.get_text()
                used_ocr = False
                # If no text was found, OCR a rendering of the page.
                if not text.strip():
                    used_ocr = True
                    mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR
                    pix = page.get_pixmap(matrix=mat)
                    image = Image.open(io.BytesIO(pix.tobytes("png")))
                    try:
                        # Arabic + English first.
                        text = pytesseract.image_to_string(
                            image,
                            lang='ara+eng',
                            config=tesseract_config,
                        )
                        print(f" OCR extracted {len(text)} characters from page {page_num + 1}")
                    except Exception as ocr_error:
                        print(f" OCR with Arabic failed: {ocr_error}")
                        # Retry with English only.
                        try:
                            text = pytesseract.image_to_string(
                                image,
                                lang='eng',
                                config=tesseract_config,
                            )
                            print(f" OCR (English only) extracted {len(text)} characters from page {page_num + 1}")
                        except Exception as eng_error:
                            print(f" OCR completely failed: {eng_error}")
                            text = ""
                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'pymupdf_ocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'used_ocr': used_ocr,
                    },
                })
        finally:
            # Release the document even when a page fails mid-way.
            doc.close()
        return pages
    except Exception as e:
        raise Exception(f"PyMuPDF + OCR extraction failed: {str(e)}") from e
def extract_with_ocr_easyocr(file_path: Path) -> List[Dict[str, Any]]:
    """Extract per-page text by rendering each page and running EasyOCR.

    Every page is rendered to an image and passed to an EasyOCR reader
    configured for Arabic and English on the CPU. Recognised fragments with
    a confidence above 0.3 are joined with single spaces.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        One dict per page with the keys 'content', 'page_number' (1-based),
        'extraction_method' ('easyocr') and 'metadata' ('page_size',
        'rotation', 'ocr_results', 'average_confidence'). The average is
        taken over all OCR results, including those filtered out of
        'content'; it is 0 when nothing was recognised.

    Raises:
        Exception: Any failure is wrapped with an "EasyOCR extraction failed"
            message; the original error is attached as ``__cause__``.
    """
    try:
        import fitz
        import easyocr
        import numpy as np
        from PIL import Image
        import io

        print(" Using EasyOCR")
        pages = []
        # Open the PDF before building the reader so an unreadable file
        # fails fast.
        doc = fitz.open(str(file_path))
        try:
            # Arabic and English, CPU only.
            reader = easyocr.Reader(['ar', 'en'], gpu=False)
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                mat = fitz.Matrix(3.0, 3.0)  # 3x zoom for better OCR
                pix = page.get_pixmap(matrix=mat)
                # Convert the rendering to a numpy array for EasyOCR.
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                img_array = np.array(image)
                results = reader.readtext(img_array)
                # Keep only fragments above the confidence threshold.
                text = ' '.join(
                    fragment
                    for _, fragment, confidence in results
                    if confidence > 0.3
                )
                print(f" EasyOCR extracted {len(text)} characters from page {page_num + 1}")
                if results:
                    average_confidence = sum(conf for _, _, conf in results) / len(results)
                else:
                    average_confidence = 0
                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'easyocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'ocr_results': len(results),
                        'average_confidence': average_confidence,
                    },
                })
        finally:
            # Release the document even when OCR fails mid-way.
            doc.close()
        return pages
    except Exception as e:
        raise Exception(f"EasyOCR extraction failed: {str(e)}") from e
def create_ocr_fix_script():
    """Write ``install_ocr.sh`` into the current working directory.

    The generated bash script installs the Python OCR packages with pip and
    carries comments on how to obtain Tesseract and its Arabic language
    data. The file is written as UTF-8 with LF line endings (so the shebang
    line is not followed by a carriage return on Windows) and is marked
    executable where the platform allows it, so ``./install_ocr.sh`` can be
    run directly.
    """
    ocr_install_script = """#!/bin/bash
# OCR Libraries Installation Script
echo "Installing OCR libraries for scanned PDF processing..."
# Install Tesseract (Windows using conda/pip)
echo "1. Installing Tesseract OCR..."
# For Windows with conda:
# conda install -c conda-forge tesseract
# For Windows manually: Download from https://github.com/UB-Mannheim/tesseract/wiki
# Install Python OCR libraries
echo "2. Installing Python OCR libraries..."
pip install pytesseract easyocr
# Arabic language data for Tesseract
echo "3. Installing Arabic language support..."
# Tesseract Arabic data should be downloaded automatically
# Manual download: https://github.com/tesseract-ocr/tessdata
echo "OCR installation complete!"
echo ""
echo "To test OCR availability, run:"
echo "python enhanced_pdf_processor.py --test-ocr"
"""
    script_path = Path("install_ocr.sh")
    # newline='\n' disables platform newline translation.
    with open(script_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(ocr_install_script)
    try:
        # Add the execute bits on top of whatever mode the file already has.
        script_path.chmod(script_path.stat().st_mode | 0o111)
    except OSError:
        # Best effort: the script is still usable via `bash install_ocr.sh`.
        pass
    print("Created install_ocr.sh script for OCR library installation")
def extract_text_with_fallback(file_path: Path) -> Tuple[List[Dict[str, Any]], str]:
    """Extract text using multiple methods with intelligent fallback.

    Order of attempts:
      1. PyMuPDF text-layer extraction, used when any of the first three
         pages contains text.
      2. EasyOCR, accepted when it yields more than 50 characters in total.
      3. pytesseract OCR, accepted under the same 50-character rule.

    OCR availability is probed only after step 1 has not produced a result,
    so text-based PDFs do not pay for importing the OCR libraries.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        ``(pages, method_used)`` where ``pages`` is a list of per-page dicts
        and ``method_used`` is 'pymupdf_regular', 'easyocr' or
        'pytesseract_ocr'.

    Raises:
        Exception: "OCR libraries required for image-based PDF" when neither
            supported OCR engine (EasyOCR, pytesseract) is installed; in that
            case install instructions are printed and ``install_ocr.sh`` is
            generated. "All extraction and OCR methods failed" otherwise.
    """
    # Method 1: regular extraction from the text layer.
    try:
        import fitz
        doc = fitz.open(str(file_path))
        try:
            # Sampling the first 3 pages decides whether a text layer exists.
            sample_size = min(3, len(doc))
            has_text = any(
                doc.load_page(page_num).get_text().strip()
                for page_num in range(sample_size)
            )
            if has_text:
                print(" Document has extractable text, using regular extraction")
                pages = []
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    pages.append({
                        'content': page.get_text(),
                        'page_number': page_num + 1,
                        'extraction_method': 'pymupdf_regular',
                        'metadata': {'page_size': page.rect, 'rotation': page.rotation},
                    })
                return pages, 'pymupdf_regular'
        finally:
            # Single open/close pair, released on every exit path.
            doc.close()
    except Exception as e:
        print(f" Regular extraction failed: {e}")

    # Method 2: OCR methods for image-based PDFs.
    print(" Document appears to be image-based, trying OCR methods...")
    ocr_available = test_ocr_availability()

    # Try EasyOCR (best for Arabic).
    if ocr_available['easyocr']:
        try:
            pages = extract_with_ocr_easyocr(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > 50:  # Minimum reasonable content
                return pages, 'easyocr'
        except Exception as e:
            print(f" EasyOCR failed: {e}")

    # Try pytesseract.
    if ocr_available['pytesseract']:
        try:
            pages = extract_with_ocr_pytesseract(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > 50:
                return pages, 'pytesseract_ocr'
        except Exception as e:
            print(f" Pytesseract OCR failed: {e}")

    # Only EasyOCR and pytesseract have extractors here; a PaddleOCR-only
    # environment must still receive the install instructions.
    if not (ocr_available['easyocr'] or ocr_available['pytesseract']):
        print("\n ERROR: No OCR libraries available!")
        print(" This PDF contains only images and requires OCR processing.")
        print(" To enable OCR support, install one of the following:")
        print(" 1. pip install pytesseract + Install Tesseract OCR")
        print(" 2. pip install easyocr (recommended for Arabic)")
        print(" 3. Run: ./install_ocr.sh (installation script)")
        create_ocr_fix_script()
        raise Exception("OCR libraries required for image-based PDF")
    raise Exception("All extraction and OCR methods failed")
def main():
    """Command-line entry point.

    Usage:
        python enhanced_pdf_processor.py <pdf_file>
        python enhanced_pdf_processor.py --test-ocr

    With ``--test-ocr`` the OCR availability report is printed and the
    function returns. Otherwise the given PDF is processed and a summary
    plus a short sample of the first two pages is printed.

    Exits with status 1 (via ``sys.exit``) when no argument is given, when
    the file does not exist, or when extraction raises, so shell callers can
    detect failure.
    """
    if len(sys.argv) < 2:
        print("Usage:")
        print(" python enhanced_pdf_processor.py <pdf_file>")
        print(" python enhanced_pdf_processor.py --test-ocr")
        print("Example: python enhanced_pdf_processor.py 'path/to/scanned.pdf'")
        sys.exit(1)

    if sys.argv[1] == '--test-ocr':
        print("Testing OCR library availability:")
        print("=" * 40)
        ocr_available = test_ocr_availability()
        if any(ocr_available.values()):
            print("\nOCR Status: READY")
            for lib, available in ocr_available.items():
                status = "Available" if available else "Not Available"
                print(f" {lib}: {status}")
        else:
            print("\nOCR Status: NOT READY")
            print("Run install_ocr.sh to install OCR libraries")
        return

    pdf_file = sys.argv[1]
    print("Enhanced PDF Processor with OCR Support")
    print(f"Target file: {pdf_file}")
    print("=" * 60)
    try:
        file_path = Path(pdf_file)
        if not file_path.exists():
            print(f"ERROR: File not found: {pdf_file}")
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept it.
            sys.exit(1)

        # Extract text with intelligent fallback.
        pages, method_used = extract_text_with_fallback(file_path)

        # Results
        total_chars = sum(len(page['content']) for page in pages)
        print("\n" + "=" * 60)
        print("EXTRACTION RESULTS")
        print("=" * 60)
        print(f"Method used: {method_used}")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")
        if total_chars > 0:
            print("\nSUCCESS: Text extracted successfully!")
            # Show sample content from the first 2 pages.
            for page in pages[:2]:
                content = page['content'].strip()
                if content:
                    print(f"\nPage {page['page_number']} (first 200 chars):")
                    print(f" {content[:200]}...")
        else:
            print("\nWARNING: No text could be extracted")
    except Exception as e:
        print(f"\nERROR: {e}")
        print("\nFull error traceback:")
        traceback.print_exc()
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()