Spaces:

AhmedEwis
/

CMP_AI_RAG

Sleeping

App Files Files Community

CMP_AI_RAG / enhanced_pdf_processor.py

AhmedEwis

Upload 17 files

7ce3a9e verified 3 months ago

raw

history blame contribute delete

12.6 kB

	#!/usr/bin/env python3
	"""
	Enhanced PDF processor with OCR support for image-based PDFs.
	Handles both text-based and image-based (scanned) PDFs.
	"""

	import asyncio
	import logging
	import os
	import sys
	import traceback
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple, Union

	def test_ocr_availability():
	    """Probe the optional OCR backends and report which ones can be used.

	    Returns:
	        Dict keyed by 'pytesseract', 'easyocr' and 'paddleocr'. A value is
	        True when the backend imported successfully; for pytesseract the
	        Pillow import and a Tesseract version query must succeed as well.

	    One status line per backend is printed to stdout.
	    """
	    status = dict.fromkeys(('pytesseract', 'easyocr', 'paddleocr'), False)

	    # pytesseract needs more than the wrapper module: Pillow must import
	    # and the version query must succeed to verify the installation.
	    try:
	        import pytesseract
	        from PIL import Image

	        tesseract_version = pytesseract.get_tesseract_version()
	        status['pytesseract'] = True
	        print(f" Pytesseract available - Tesseract version: {tesseract_version}")
	    except Exception as problem:
	        print(f" Pytesseract not available: {problem}")

	    # The remaining backends count as available as soon as they import.
	    for module_name, label in (('easyocr', 'EasyOCR'), ('paddleocr', 'PaddleOCR')):
	        try:
	            __import__(module_name)
	            status[module_name] = True
	            print(f" {label} available")
	        except Exception as problem:
	            print(f" {label} not available: {problem}")

	    return status


	def extract_with_ocr_pytesseract(file_path: Path) -> List[Dict[str, Any]]:
	    """Extract per-page text with PyMuPDF, using Tesseract OCR for empty pages.

	    For every page the embedded text is read first. Only when that text is
	    empty (whitespace only) is the page rendered at 2x zoom and passed to
	    pytesseract, first with 'ara+eng' and, if that call raises, with 'eng'.

	    Args:
	        file_path: Path of the PDF to process.

	    Returns:
	        One dict per page with the keys 'content', 'page_number' (1-based),
	        'extraction_method' (always 'pymupdf_ocr') and 'metadata'
	        ('page_size', 'rotation', 'used_ocr'). 'used_ocr' is True only when
	        the page content was produced by a successful OCR call.

	    Raises:
	        Exception: If a required library cannot be imported, the document
	            cannot be opened, or a page fails to process. The underlying
	            error is chained as ``__cause__``.
	    """
	    try:
	        import fitz
	        import pytesseract
	        from PIL import Image
	        import io

	        print(" Using PyMuPDF + pytesseract OCR")

	        doc = fitz.open(str(file_path))
	        try:
	            pages = []

	            for page_number, page in enumerate(doc, start=1):
	                # First try regular text extraction
	                text = page.get_text()
	                used_ocr = False

	                # If no text found, try OCR on a rendering of the page
	                if not text.strip():
	                    # 2x zoom for better OCR
	                    pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
	                    image = Image.open(io.BytesIO(pix.tobytes("png")))

	                    try:
	                        # Configure for Arabic + English
	                        text = pytesseract.image_to_string(
	                            image,
	                            lang='ara+eng',
	                            config='--oem 3 --psm 6'
	                        )
	                        used_ocr = True
	                        print(f" OCR extracted {len(text)} characters from page {page_number}")
	                    except Exception as ocr_error:
	                        print(f" OCR with Arabic failed: {ocr_error}")
	                        # Try with just English
	                        try:
	                            text = pytesseract.image_to_string(
	                                image,
	                                lang='eng',
	                                config='--oem 3 --psm 6'
	                            )
	                            used_ocr = True
	                            print(f" OCR (English only) extracted {len(text)} characters from page {page_number}")
	                        except Exception as eng_error:
	                            print(f" OCR completely failed: {eng_error}")
	                            text = ""

	                pages.append({
	                    'content': text,
	                    'page_number': page_number,
	                    'extraction_method': 'pymupdf_ocr',
	                    'metadata': {
	                        'page_size': page.rect,
	                        'rotation': page.rotation,
	                        # Reflects whether OCR produced the content, not
	                        # merely whether the page ended up with text.
	                        'used_ocr': used_ocr,
	                    },
	                })

	            return pages
	        finally:
	            # Release the document even when a page raises.
	            doc.close()

	    except Exception as e:
	        raise Exception(f"PyMuPDF + OCR extraction failed: {str(e)}") from e


	def extract_with_ocr_easyocr(file_path: Path) -> List[Dict[str, Any]]:
	    """Extract per-page text by running EasyOCR on rendered page images.

	    Every page is rendered at 3x zoom and passed to an EasyOCR reader
	    configured for Arabic and English on CPU; the embedded text of the PDF
	    is not consulted. Detections with confidence above 0.3 are joined with
	    single spaces to form the page content.

	    Args:
	        file_path: Path of the PDF to process.

	    Returns:
	        One dict per page with the keys 'content', 'page_number' (1-based),
	        'extraction_method' ('easyocr') and 'metadata' ('page_size',
	        'rotation', 'ocr_results', 'average_confidence'). 'ocr_results' and
	        'average_confidence' cover all detections, including those that
	        were filtered out of 'content'.

	    Raises:
	        Exception: If a required library cannot be imported, the document
	            cannot be opened, or OCR fails. The underlying error is chained
	            as ``__cause__``.
	    """
	    try:
	        import fitz
	        import easyocr
	        import numpy as np
	        from PIL import Image
	        import io

	        print(" Using EasyOCR")

	        # Open the document before building the reader so an unreadable
	        # file fails without first initializing the OCR engine.
	        doc = fitz.open(str(file_path))
	        try:
	            # Initialize EasyOCR reader with Arabic and English, on CPU
	            reader = easyocr.Reader(['ar', 'en'], gpu=False)
	            pages = []

	            for page_number, page in enumerate(doc, start=1):
	                # 3x zoom for better OCR
	                pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))

	                # Convert to a numpy array for EasyOCR
	                image = Image.open(io.BytesIO(pix.tobytes("png")))
	                results = reader.readtext(np.array(image))

	                # Keep only detections above the confidence threshold
	                text = ' '.join(
	                    fragment
	                    for _, fragment, confidence in results
	                    if confidence > 0.3
	                )
	                print(f" EasyOCR extracted {len(text)} characters from page {page_number}")

	                pages.append({
	                    'content': text,
	                    'page_number': page_number,
	                    'extraction_method': 'easyocr',
	                    'metadata': {
	                        'page_size': page.rect,
	                        'rotation': page.rotation,
	                        'ocr_results': len(results),
	                        'average_confidence': (
	                            sum(conf for _, _, conf in results) / len(results)
	                            if results else 0
	                        ),
	                    },
	                })

	            return pages
	        finally:
	            # Release the document even when OCR raises.
	            doc.close()

	    except Exception as e:
	        raise Exception(f"EasyOCR extraction failed: {str(e)}") from e


	def create_ocr_fix_script():
	    """Write ``install_ocr.sh`` to the current working directory.

	    The generated bash script echoes progress messages and runs
	    ``pip install pytesseract easyocr``; the Tesseract and Arabic language
	    data steps are present only as comments with manual instructions.

	    The file is written as UTF-8 with LF line endings, so the shebang stays
	    valid when the script is generated on Windows, and the execute bits are
	    set so that it can be started as ``./install_ocr.sh``.
	    """
	    script_lines = (
	        '#!/bin/bash',
	        '# OCR Libraries Installation Script',
	        '',
	        'echo "Installing OCR libraries for scanned PDF processing..."',
	        '',
	        '# Install Tesseract (Windows using conda/pip)',
	        'echo "1. Installing Tesseract OCR..."',
	        '# For Windows with conda:',
	        '# conda install -c conda-forge tesseract',
	        '# For Windows manually: Download from https://github.com/UB-Mannheim/tesseract/wiki',
	        '',
	        '# Install Python OCR libraries',
	        'echo "2. Installing Python OCR libraries..."',
	        'pip install pytesseract easyocr',
	        '',
	        '# Arabic language data for Tesseract',
	        'echo "3. Installing Arabic language support..."',
	        '# Tesseract Arabic data should be downloaded automatically',
	        '# Manual download: https://github.com/tesseract-ocr/tessdata',
	        '',
	        'echo "OCR installation complete!"',
	        'echo ""',
	        'echo "To test OCR availability, run:"',
	        'echo "python enhanced_pdf_processor.py --test-ocr"',
	    )
	    ocr_install_script = "\n".join(script_lines) + "\n"

	    script_path = Path("install_ocr.sh")
	    # newline="\n" stops text mode from translating to CRLF on Windows.
	    with open(script_path, 'w', encoding='utf-8', newline='\n') as f:
	        f.write(ocr_install_script)

	    # Add the execute bits on top of whatever mode the file was created with.
	    script_path.chmod(script_path.stat().st_mode | 0o111)

	    print("Created install_ocr.sh script for OCR library installation")


	def _extract_text_layer(file_path: Path) -> Optional[List[Dict[str, Any]]]:
	    """Return per-page embedded text, or None if the first 3 pages have none."""
	    import fitz

	    doc = fitz.open(str(file_path))
	    try:
	        # Only the first 3 pages are probed to decide whether text exists.
	        probe_count = min(3, len(doc))
	        has_text = any(
	            doc.load_page(index).get_text().strip()
	            for index in range(probe_count)
	        )
	        if not has_text:
	            return None

	        print(" Document has extractable text, using regular extraction")
	        return [
	            {
	                'content': page.get_text(),
	                'page_number': page_number,
	                'extraction_method': 'pymupdf_regular',
	                'metadata': {'page_size': page.rect, 'rotation': page.rotation},
	            }
	            for page_number, page in enumerate(doc, start=1)
	        ]
	    finally:
	        # Release the document even when a page raises.
	        doc.close()


	def extract_text_with_fallback(file_path: Path) -> Tuple[List[Dict[str, Any]], str]:
	    """Extract text from a PDF, falling back to OCR when it has no text.

	    Order of attempts:
	      1. Regular PyMuPDF extraction, used when any of the first 3 pages
	         contains text.
	      2. EasyOCR, if available.
	      3. pytesseract, if available.
	    An OCR result is accepted only when it holds more than 50 characters
	    in total across all pages.

	    OCR availability is probed only after regular extraction did not
	    produce a result, so PDFs with a text layer never trigger the OCR
	    library imports.

	    Args:
	        file_path: Path of the PDF to process.

	    Returns:
	        ``(pages, method_used)`` where ``method_used`` is one of
	        'pymupdf_regular', 'easyocr' or 'pytesseract_ocr'.

	    Raises:
	        Exception: When neither usable OCR backend (EasyOCR, pytesseract)
	            is installed, in which case install instructions are printed
	            and ``install_ocr.sh`` is generated first, or when every
	            attempted method failed or returned too little text.
	    """
	    # Method 1: Try regular extraction first
	    try:
	        pages = _extract_text_layer(file_path)
	        if pages is not None:
	            return pages, 'pymupdf_regular'
	    except Exception as e:
	        print(f" Regular extraction failed: {e}")

	    # Method 2: OCR methods for image-based PDFs
	    print(" Document appears to be image-based, trying OCR methods...")
	    ocr_available = test_ocr_availability()

	    # Only backends that have an extractor here count as usable; paddleocr
	    # is probed by test_ocr_availability but cannot be used for extraction.
	    if not (ocr_available['easyocr'] or ocr_available['pytesseract']):
	        print("\n ERROR: No OCR libraries available!")
	        print(" This PDF contains only images and requires OCR processing.")
	        print(" To enable OCR support, install one of the following:")
	        print(" 1. pip install pytesseract + Install Tesseract OCR")
	        print(" 2. pip install easyocr (recommended for Arabic)")
	        print(" 3. Run: ./install_ocr.sh (installation script)")

	        create_ocr_fix_script()
	        raise Exception("OCR libraries required for image-based PDF")

	    # EasyOCR is tried first (best for Arabic), then pytesseract.
	    backends = (
	        ('easyocr', extract_with_ocr_easyocr, 'easyocr', 'EasyOCR'),
	        ('pytesseract', extract_with_ocr_pytesseract, 'pytesseract_ocr', 'Pytesseract OCR'),
	    )
	    for key, extractor, method_name, label in backends:
	        if not ocr_available[key]:
	            continue
	        try:
	            pages = extractor(file_path)
	        except Exception as e:
	            print(f" {label} failed: {e}")
	            continue

	        total_chars = sum(len(page['content']) for page in pages)
	        if total_chars > 50:  # Minimum reasonable content
	            return pages, method_name
	        print(f" {label} extracted only {total_chars} characters, trying next method")

	    raise Exception("All extraction and OCR methods failed")


	def main():
	    """Command-line entry point.

	    Usage:
	        python enhanced_pdf_processor.py <pdf_file>
	        python enhanced_pdf_processor.py --test-ocr

	    With ``--test-ocr`` the OCR backends are probed and a status report is
	    printed. Otherwise the given PDF is processed with
	    ``extract_text_with_fallback`` and a summary plus a sample of the first
	    two pages is printed.

	    Exits with status 1 when no argument is given, when the input file does
	    not exist, or when extraction or reporting raises, so that shells and
	    pipelines can detect the failure.
	    """
	    if len(sys.argv) < 2:
	        print("Usage:")
	        print(" python enhanced_pdf_processor.py <pdf_file>")
	        print(" python enhanced_pdf_processor.py --test-ocr")
	        print("Example: python enhanced_pdf_processor.py 'path/to/scanned.pdf'")
	        sys.exit(1)

	    if sys.argv[1] == '--test-ocr':
	        print("Testing OCR library availability:")
	        print("=" * 40)
	        ocr_available = test_ocr_availability()

	        if any(ocr_available.values()):
	            print("\nOCR Status: READY")
	            for lib, available in ocr_available.items():
	                status = "Available" if available else "Not Available"
	                print(f" {lib}: {status}")
	        else:
	            print("\nOCR Status: NOT READY")
	            print("Run install_ocr.sh to install OCR libraries")
	        return

	    pdf_file = sys.argv[1]
	    print("Enhanced PDF Processor with OCR Support")
	    print(f"Target file: {pdf_file}")
	    print("=" * 60)

	    file_path = Path(pdf_file)
	    if not file_path.exists():
	        print(f"ERROR: File not found: {pdf_file}")
	        sys.exit(1)

	    try:
	        # Extract text with intelligent fallback
	        pages, method_used = extract_text_with_fallback(file_path)

	        # Results
	        total_chars = sum(len(page['content']) for page in pages)

	        print("\n" + "=" * 60)
	        print("EXTRACTION RESULTS")
	        print("=" * 60)
	        print(f"Method used: {method_used}")
	        print(f"Total pages: {len(pages)}")
	        print(f"Total characters: {total_chars}")

	        if total_chars > 0:
	            print("\nSUCCESS: Text extracted successfully!")

	            # Show sample content from the first 2 pages
	            for page in pages[:2]:
	                content = page['content'].strip()
	                if content:
	                    print(f"\nPage {page['page_number']} (first 200 chars):")
	                    print(f" {content[:200]}...")
	        else:
	            print("\nWARNING: No text could be extracted")

	    except Exception as e:
	        # Reporting stays inside the try so that a failure while printing
	        # the extracted text is reported the same way as an extraction error.
	        print(f"\nERROR: {e}")
	        print("\nFull error traceback:")
	        traceback.print_exc()
	        sys.exit(1)


	# Run the command-line interface only when this file is executed as a
	# script; importing the module does not start any processing.
	if __name__ == "__main__":
	    main()