Spaces:

AhmedEwis
/

CMP_AI_RAG

Sleeping

File size: 12,564 Bytes

7ce3a9e

#!/usr/bin/env python3
"""
Enhanced PDF processor with OCR support for image-based PDFs.
Handles both text-based and image-based (scanned) PDFs.
"""

import asyncio
import logging
import os
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

def test_ocr_availability():
    """Probe which OCR backends can be imported in this environment.

    Each backend is checked independently and a one-line status is printed
    for it. A backend counts as available only if its probe completes
    without raising.

    Returns:
        dict mapping 'pytesseract', 'easyocr' and 'paddleocr' to a bool.
    """

    def _probe_pytesseract():
        # Pillow must be importable too, and asking for the version verifies
        # that the Tesseract binary itself can be located and executed.
        import pytesseract
        from PIL import Image
        return f" - Tesseract version: {pytesseract.get_tesseract_version()}"

    def _probe_easyocr():
        import easyocr
        return ""

    def _probe_paddleocr():
        import paddleocr
        return ""

    # (result key, label used in the printed messages, probe callable)
    backends = (
        ('pytesseract', 'Pytesseract', _probe_pytesseract),
        ('easyocr', 'EasyOCR', _probe_easyocr),
        ('paddleocr', 'PaddleOCR', _probe_paddleocr),
    )

    status = {key: False for key, _label, _probe in backends}

    for key, label, probe in backends:
        try:
            detail = probe()
            status[key] = True
            print(f"  {label} available{detail}")
        except Exception as problem:
            print(f"  {label} not available: {problem}")

    return status


def _tesseract_ocr_image(pytesseract, image, page_number: int) -> Tuple[str, bool]:
    """OCR one rendered page image, trying Arabic+English and then English only.

    Args:
        pytesseract: The imported ``pytesseract`` module.
        image: PIL image of the rendered page.
        page_number: 1-based page number, used only in progress messages.

    Returns:
        ``(text, succeeded)``; ``("", False)`` when both attempts raised.
    """
    try:
        text = pytesseract.image_to_string(
            image,
            lang='ara+eng',
            config='--oem 3 --psm 6'
        )
        print(f"    OCR extracted {len(text)} characters from page {page_number}")
        return text, True
    except Exception as ocr_error:
        print(f"    OCR with Arabic failed: {ocr_error}")

    # The Arabic language data may simply be missing; retry with English only.
    try:
        text = pytesseract.image_to_string(
            image,
            lang='eng',
            config='--oem 3 --psm 6'
        )
        print(f"    OCR (English only) extracted {len(text)} characters from page {page_number}")
        return text, True
    except Exception as eng_error:
        print(f"    OCR completely failed: {eng_error}")
        return "", False


def extract_with_ocr_pytesseract(file_path: Path) -> List[Dict[str, Any]]:
    """Extract text page by page with PyMuPDF, using pytesseract OCR when needed.

    For every page the embedded text layer is read first. Only when that is
    empty (after stripping whitespace) is the page rendered at 2x zoom and
    passed to Tesseract.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        One dict per page with the keys 'content', 'page_number' (1-based),
        'extraction_method' (always 'pymupdf_ocr') and 'metadata'.
        ``metadata['used_ocr']`` is True only when the page content was
        actually produced by OCR and is non-empty.

    Raises:
        Exception: If a required library is missing or the document cannot be
            processed. The original error is attached as ``__cause__``.
    """
    try:
        import fitz
        import pytesseract
        from PIL import Image
        import io

        print("  Using PyMuPDF + pytesseract OCR")

        doc = fitz.open(str(file_path))
        try:
            pages = []
            render_matrix = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR

            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # First try regular text extraction
                text = page.get_text()
                ocr_succeeded = False

                # If no text found, OCR a rendering of the whole page
                if not text.strip():
                    pix = page.get_pixmap(matrix=render_matrix)
                    image = Image.open(io.BytesIO(pix.tobytes("png")))
                    text, ocr_succeeded = _tesseract_ocr_image(
                        pytesseract, image, page_num + 1
                    )

                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'pymupdf_ocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        # Pages served from the text layer must not be
                        # reported as OCR output.
                        'used_ocr': ocr_succeeded and bool(text.strip())
                    }
                })

            return pages
        finally:
            # Release the document even when a page fails midway.
            doc.close()

    except Exception as e:
        raise Exception(f"PyMuPDF + OCR extraction failed: {str(e)}") from e


def extract_with_ocr_easyocr(
    file_path: Path,
    min_confidence: float = 0.3,
    zoom: float = 3.0,
) -> List[Dict[str, Any]]:
    """Extract text from every page of a PDF with EasyOCR (Arabic + English).

    Each page is rendered to an image with PyMuPDF and passed to an EasyOCR
    reader running on the CPU. Recognised fragments whose confidence is
    strictly greater than ``min_confidence`` are joined with single spaces.

    Args:
        file_path: Path of the PDF to read.
        min_confidence: Fragments at or below this confidence are dropped
            from the page content.
        zoom: Scale factor applied to both axes when rendering a page.

    Returns:
        One dict per page with the keys 'content', 'page_number' (1-based),
        'extraction_method' ('easyocr') and 'metadata'. In the metadata,
        'ocr_results' and 'average_confidence' are computed over ALL
        recognised fragments, including the ones filtered out of 'content'.

    Raises:
        Exception: If a required library is missing or processing fails.
            The original error is attached as ``__cause__``.
    """
    try:
        import fitz
        import easyocr
        import numpy as np
        from PIL import Image
        import io

        print("  Using EasyOCR")

        # Open the document before building the reader so an unreadable file
        # fails fast, without paying for the OCR model start-up.
        doc = fitz.open(str(file_path))
        try:
            # Initialize EasyOCR reader with Arabic and English, on the CPU
            reader = easyocr.Reader(['ar', 'en'], gpu=False)

            render_matrix = fitz.Matrix(zoom, zoom)
            pages = []

            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # Render the page and hand it to EasyOCR as a numpy array
                pix = page.get_pixmap(matrix=render_matrix)
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                results = reader.readtext(np.array(image))

                # Keep only fragments recognised with enough confidence
                text = ' '.join(
                    fragment
                    for _bbox, fragment, confidence in results
                    if confidence > min_confidence
                )
                print(f"    EasyOCR extracted {len(text)} characters from page {page_num + 1}")

                if results:
                    average_confidence = sum(conf for _, _, conf in results) / len(results)
                else:
                    average_confidence = 0

                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'easyocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'ocr_results': len(results),
                        'average_confidence': average_confidence
                    }
                })

            return pages
        finally:
            # Release the document even when a page fails midway.
            doc.close()

    except Exception as e:
        raise Exception(f"EasyOCR extraction failed: {str(e)}") from e


def create_ocr_fix_script(output_path: str = "install_ocr.sh"):
    """Write a helper shell script that installs the Python OCR libraries.

    The script is written as UTF-8 with LF line endings, because a CRLF
    shebang line breaks bash when the file is generated on Windows. It is
    then marked executable so that it can be started as ``./install_ocr.sh``.

    Args:
        output_path: Where to write the script. Defaults to "install_ocr.sh"
            in the current working directory.
    """
    ocr_install_script = """#!/bin/bash
# OCR Libraries Installation Script

echo "Installing OCR libraries for scanned PDF processing..."

# Install Tesseract (Windows using conda/pip)
echo "1. Installing Tesseract OCR..."
# For Windows with conda:
# conda install -c conda-forge tesseract
# For Windows manually: Download from https://github.com/UB-Mannheim/tesseract/wiki

# Install Python OCR libraries
echo "2. Installing Python OCR libraries..."
pip install pytesseract easyocr

# Arabic language data for Tesseract
echo "3. Installing Arabic language support..."
# Tesseract Arabic data should be downloaded automatically
# Manual download: https://github.com/tesseract-ocr/tessdata

echo "OCR installation complete!"
echo ""
echo "To test OCR availability, run:"
echo "python enhanced_pdf_processor.py --test-ocr"
"""

    import os

    with open(output_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write(ocr_install_script)

    try:
        mode = os.stat(output_path).st_mode
        # Grant execute permission to whoever already has read permission.
        os.chmod(output_path, mode | ((mode & 0o444) >> 2))
    except OSError as chmod_error:
        # Best effort only: the script can still be started with "bash <file>".
        print(f"Could not mark {output_path} as executable: {chmod_error}")

    print(f"Created {output_path} script for OCR library installation")


def extract_text_with_fallback(file_path: Path) -> Tuple[List[Dict[str, Any]], str]:
    """Extract text from a PDF, falling back to OCR for image-only documents.

    Strategy:
      1. Open the file with PyMuPDF and look for a text layer on the first
         three pages. If any is found, return the text layer of all pages.
      2. Otherwise try EasyOCR and then pytesseract, whichever is installed.
         An OCR result is accepted only if it totals more than 50 characters.

    OCR availability is probed only after step 1 has not produced a result,
    so documents with a text layer never trigger the OCR library imports.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        ``(pages, method_used)`` where ``pages`` is a list of per-page dicts
        and ``method_used`` is 'pymupdf_regular', 'easyocr' or
        'pytesseract_ocr'.

    Raises:
        Exception: If no OCR library is installed (an install script is
            written first), or if every available method failed or produced
            too little text.
    """
    min_ocr_chars = 50  # Minimum reasonable content for an OCR result

    # Method 1: Try regular extraction first
    try:
        import fitz

        doc = fitz.open(str(file_path))
        try:
            page_count = len(doc)

            # Check the first 3 pages for an extractable text layer
            has_text = any(
                doc.load_page(page_num).get_text().strip()
                for page_num in range(min(3, page_count))
            )

            if has_text:
                print("  Document has extractable text, using regular extraction")
                pages = []
                for page_num in range(page_count):
                    page = doc.load_page(page_num)
                    pages.append({
                        'content': page.get_text(),
                        'page_number': page_num + 1,
                        'extraction_method': 'pymupdf_regular',
                        'metadata': {'page_size': page.rect, 'rotation': page.rotation}
                    })
                return pages, 'pymupdf_regular'
        finally:
            # Single open/close pair; runs on the return path and on errors.
            doc.close()

    except Exception as e:
        print(f"  Regular extraction failed: {e}")

    # Method 2: OCR methods for image-based PDFs
    print("  Document appears to be image-based, trying OCR methods...")

    # Probe lazily: importing OCR packages is only worth it from here on.
    ocr_available = test_ocr_availability()

    # (availability key, extractor, returned method name, label for messages)
    # EasyOCR goes first because it is preferred for Arabic.
    ocr_attempts = (
        ('easyocr', extract_with_ocr_easyocr, 'easyocr', 'EasyOCR'),
        ('pytesseract', extract_with_ocr_pytesseract, 'pytesseract_ocr', 'Pytesseract OCR'),
    )

    for key, extractor, method_name, label in ocr_attempts:
        if not ocr_available[key]:
            continue
        try:
            pages = extractor(file_path)
        except Exception as e:
            print(f"  {label} failed: {e}")
            continue

        total_chars = sum(len(page['content']) for page in pages)
        if total_chars > min_ocr_chars:
            return pages, method_name
        print(f"  {label} produced only {total_chars} characters, trying next method")

    # If no OCR available, provide instructions
    if not any(ocr_available.values()):
        print("\n  ERROR: No OCR libraries available!")
        print("  This PDF contains only images and requires OCR processing.")
        print("  To enable OCR support, install one of the following:")
        print("    1. pip install pytesseract + Install Tesseract OCR")
        print("    2. pip install easyocr (recommended for Arabic)")
        print("    3. Run: ./install_ocr.sh (installation script)")

        create_ocr_fix_script()
        raise Exception("OCR libraries required for image-based PDF")

    raise Exception("All extraction and OCR methods failed")


def main():
    """Command-line entry point.

    Usage:
        python enhanced_pdf_processor.py <pdf_file>
        python enhanced_pdf_processor.py --test-ocr

    Returns:
        0 on success. 1 when the file is missing, extraction fails, no text
        is extracted, or ``--test-ocr`` finds no OCR library. The module guard
        below passes this value to ``sys.exit`` so that shell scripts can
        detect failures.

    Raises:
        SystemExit: With status 1 when no argument is given.
    """
    if len(sys.argv) < 2:
        print("Usage:")
        print("  python enhanced_pdf_processor.py <pdf_file>")
        print("  python enhanced_pdf_processor.py --test-ocr")
        print("Example: python enhanced_pdf_processor.py 'path/to/scanned.pdf'")
        sys.exit(1)

    if sys.argv[1] == '--test-ocr':
        print("Testing OCR library availability:")
        print("=" * 40)
        ocr_available = test_ocr_availability()

        if any(ocr_available.values()):
            print("\nOCR Status: READY")
            for lib, available in ocr_available.items():
                status = "Available" if available else "Not Available"
                print(f"  {lib}: {status}")
            return 0

        print("\nOCR Status: NOT READY")
        print("Run install_ocr.sh to install OCR libraries")
        return 1

    pdf_file = sys.argv[1]
    print("Enhanced PDF Processor with OCR Support")
    print(f"Target file: {pdf_file}")
    print("=" * 60)

    try:
        file_path = Path(pdf_file)
        if not file_path.exists():
            print(f"ERROR: File not found: {pdf_file}")
            return 1

        # Extract text with intelligent fallback
        pages, method_used = extract_text_with_fallback(file_path)

        # Results
        total_chars = sum(len(page['content']) for page in pages)

        print("\n" + "=" * 60)
        print("EXTRACTION RESULTS")
        print("=" * 60)
        print(f"Method used: {method_used}")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")

        if total_chars == 0:
            print("\nWARNING: No text could be extracted")
            return 1

        print("\nSUCCESS: Text extracted successfully!")

        # Show sample content from the first 2 pages
        for page in pages[:2]:
            content = page['content'].strip()
            if content:
                # Only signal truncation when something was actually cut off.
                suffix = "..." if len(content) > 200 else ""
                print(f"\nPage {page['page_number']} (first 200 chars):")
                print(f"  {content[:200]}{suffix}")
        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via the status.
        print(f"\nERROR: {e}")
        print("\nFull error traceback:")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())