File size: 12,564 Bytes
7ce3a9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
#!/usr/bin/env python3
"""
Enhanced PDF processor with OCR support for image-based PDFs.
Handles both text-based and image-based (scanned) PDFs.
"""

import asyncio
import logging
import os
import sys
import traceback
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

def test_ocr_availability():
    """Probe the optional OCR backends and report which ones can be used.

    Every backend is checked inside its own ``try`` block, so a missing or
    broken library only marks that one backend as unavailable. For
    pytesseract the Tesseract version is queried as well, to verify the
    installation rather than just the Python wrapper.

    Returns:
        dict: Maps 'pytesseract', 'easyocr' and 'paddleocr' to a bool.
        One status line per backend is printed along the way.
    """

    def _probe_pytesseract():
        import pytesseract
        from PIL import Image  # imported only to confirm Pillow is present

        detected = pytesseract.get_tesseract_version()
        return f"  Pytesseract available - Tesseract version: {detected}"

    def _probe_easyocr():
        import easyocr

        return "  EasyOCR available"

    def _probe_paddleocr():
        import paddleocr

        return "  PaddleOCR available"

    # (result key, label used in the failure message, probe callable)
    checks = (
        ('pytesseract', 'Pytesseract', _probe_pytesseract),
        ('easyocr', 'EasyOCR', _probe_easyocr),
        ('paddleocr', 'PaddleOCR', _probe_paddleocr),
    )

    availability = {}
    for key, label, probe in checks:
        try:
            success_line = probe()
        except Exception as problem:
            availability[key] = False
            print(f"  {label} not available: {problem}")
        else:
            availability[key] = True
            print(success_line)

    return availability


def extract_with_ocr_pytesseract(file_path: Path) -> List[Dict[str, Any]]:
    """Extract per-page text with PyMuPDF, falling back to Tesseract OCR.

    For every page the embedded text layer is read first. Only when that is
    empty (whitespace only) is the page rendered to an image and handed to
    pytesseract: first with Arabic + English and, if that call raises, with
    English only. If both calls raise, the page content is an empty string.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        One dict per page with the keys 'content', 'page_number' (1-based),
        'extraction_method' (always 'pymupdf_ocr') and 'metadata', which holds
        'page_size', 'rotation' and 'used_ocr'. 'used_ocr' is True only when
        the page content came from an OCR call.

    Raises:
        Exception: Wraps any failure (missing library, unreadable file, ...);
            the original error is attached as ``__cause__``.
    """
    try:
        import fitz
        import pytesseract
        from PIL import Image
        import io

        print("  Using PyMuPDF + pytesseract OCR")

        # Options passed through to Tesseract on every OCR call.
        tesseract_config = '--oem 3 --psm 6'
        pages = []

        doc = fitz.open(str(file_path))
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # Prefer the embedded text layer; OCR only when it is empty.
                text = page.get_text()
                used_ocr = False

                if not text.strip():
                    # Render the page at 2x zoom for better OCR.
                    pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                    image = Image.open(io.BytesIO(pix.tobytes("png")))

                    try:
                        # Arabic + English first.
                        text = pytesseract.image_to_string(
                            image,
                            lang='ara+eng',
                            config=tesseract_config
                        )
                        used_ocr = True
                        print(f"    OCR extracted {len(text)} characters from page {page_num + 1}")
                    except Exception as ocr_error:
                        print(f"    OCR with Arabic failed: {ocr_error}")
                        # Retry with just English.
                        try:
                            text = pytesseract.image_to_string(
                                image,
                                lang='eng',
                                config=tesseract_config
                            )
                            used_ocr = True
                            print(f"    OCR (English only) extracted {len(text)} characters from page {page_num + 1}")
                        except Exception as eng_error:
                            print(f"    OCR completely failed: {eng_error}")
                            text = ""

                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'pymupdf_ocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'used_ocr': used_ocr
                    }
                })
        finally:
            # Release the document even when a page fails half-way through.
            doc.close()

        return pages

    except Exception as e:
        raise Exception(f"PyMuPDF + OCR extraction failed: {str(e)}") from e


def extract_with_ocr_easyocr(file_path: Path) -> List[Dict[str, Any]]:
    """Extract per-page text by running EasyOCR on rendered page images.

    Every page is rendered with PyMuPDF and OCRed; the embedded text layer is
    not consulted. Recognised fragments whose confidence is above 0.3 are
    joined with single spaces to form the page content.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        One dict per page with the keys 'content', 'page_number' (1-based),
        'extraction_method' ('easyocr') and 'metadata', which holds
        'page_size', 'rotation', 'ocr_results' (number of fragments returned
        by EasyOCR, before confidence filtering) and 'average_confidence'
        (mean over all returned fragments, 0 when there are none).

    Raises:
        Exception: Wraps any failure (missing library, unreadable file, ...);
            the original error is attached as ``__cause__``.
    """
    try:
        import fitz
        import easyocr
        import numpy as np
        from PIL import Image
        import io

        print("  Using EasyOCR")

        # Fragments at or below this confidence are dropped from the text.
        min_confidence = 0.3

        # Initialize EasyOCR reader with Arabic and English, on CPU.
        reader = easyocr.Reader(['ar', 'en'], gpu=False)

        pages = []
        doc = fitz.open(str(file_path))
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # Render the page at 3x zoom for better OCR.
                pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))

                # EasyOCR is fed a numpy array built from the PNG rendering.
                image = Image.open(io.BytesIO(pix.tobytes("png")))
                results = reader.readtext(np.array(image))

                text = ' '.join(
                    fragment
                    for (_bbox, fragment, confidence) in results
                    if confidence > min_confidence
                )
                print(f"    EasyOCR extracted {len(text)} characters from page {page_num + 1}")

                pages.append({
                    'content': text,
                    'page_number': page_num + 1,
                    'extraction_method': 'easyocr',
                    'metadata': {
                        'page_size': page.rect,
                        'rotation': page.rotation,
                        'ocr_results': len(results),
                        'average_confidence': sum(conf for _, _, conf in results) / len(results) if results else 0
                    }
                })
        finally:
            # Release the document even when a page fails half-way through.
            doc.close()

        return pages

    except Exception as e:
        raise Exception(f"EasyOCR extraction failed: {str(e)}") from e


def create_ocr_fix_script():
    """Write ``install_ocr.sh`` into the current working directory.

    The generated bash script installs the Python OCR packages with pip and
    carries commented hints for installing Tesseract itself. The file is
    written as UTF-8 with LF line endings and marked executable, so it can be
    started as ``./install_ocr.sh``. Failing to set the executable bit is
    reported as a warning and does not abort.

    Raises:
        OSError: If the script file cannot be written.
    """
    ocr_install_script = """#!/bin/bash
# OCR Libraries Installation Script

echo "Installing OCR libraries for scanned PDF processing..."

# Install Tesseract (Windows using conda/pip)
echo "1. Installing Tesseract OCR..."
# For Windows with conda:
# conda install -c conda-forge tesseract
# For Windows manually: Download from https://github.com/UB-Mannheim/tesseract/wiki

# Install Python OCR libraries
echo "2. Installing Python OCR libraries..."
pip install pytesseract easyocr

# Arabic language data for Tesseract
echo "3. Installing Arabic language support..."
# Tesseract Arabic data should be downloaded automatically
# Manual download: https://github.com/tesseract-ocr/tessdata

echo "OCR installation complete!"
echo ""
echo "To test OCR availability, run:"
echo "python enhanced_pdf_processor.py --test-ocr"
"""

    script_name = "install_ocr.sh"

    # newline='\n' disables newline translation: a CRLF after the shebang
    # would stop bash from running the script.
    with open(script_name, 'w', encoding='utf-8', newline='\n') as f:
        f.write(ocr_install_script)

    # Add the executable bits on top of whatever mode the file was created with.
    try:
        current_mode = os.stat(script_name).st_mode
        os.chmod(script_name, current_mode | 0o111)
    except OSError as chmod_error:
        print(f"Warning: could not mark {script_name} as executable: {chmod_error}")

    print("Created install_ocr.sh script for OCR library installation")


def extract_text_with_fallback(file_path: Path) -> Tuple[List[Dict[str, Any]], str]:
    """Extract text from a PDF, trying plain extraction before OCR.

    Strategy:
      1. Open the file with PyMuPDF and look at the first three pages. If any
         of them has a non-empty text layer, read the text layer of every
         page and return with method 'pymupdf_regular'.
      2. Otherwise (or if step 1 raised) probe the OCR backends and try
         EasyOCR, then pytesseract. A backend's result is accepted only when
         it yields more than 50 characters in total.

    The OCR backends are probed only when step 2 is reached, so text-based
    PDFs do not pay for importing the OCR libraries.

    Args:
        file_path: Path of the PDF to process.

    Returns:
        ``(pages, method_used)`` where ``pages`` is a list of per-page dicts
        ('content', 'page_number', 'extraction_method', 'metadata') and
        ``method_used`` is 'pymupdf_regular', 'easyocr' or 'pytesseract_ocr'.

    Raises:
        Exception: If no OCR backend is installed (an install script is
            written first), or if every attempted method failed or produced
            too little text.
    """
    # An OCR result must exceed this many characters in total to be accepted.
    min_ocr_chars = 50

    # Method 1: Try regular extraction first
    try:
        import fitz

        doc = fitz.open(str(file_path))
        try:
            # Sample only the first 3 pages to decide whether a text layer exists.
            has_text = any(
                doc.load_page(page_num).get_text().strip()
                for page_num in range(min(3, len(doc)))
            )

            if has_text:
                print("  Document has extractable text, using regular extraction")
                pages = []
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    pages.append({
                        'content': page.get_text(),
                        'page_number': page_num + 1,
                        'extraction_method': 'pymupdf_regular',
                        'metadata': {'page_size': page.rect, 'rotation': page.rotation}
                    })
                return pages, 'pymupdf_regular'
        finally:
            # Close the document on every path, including errors.
            doc.close()

    except Exception as e:
        print(f"  Regular extraction failed: {e}")

    # Method 2: OCR methods for image-based PDFs
    print("  Document appears to be image-based, trying OCR methods...")

    # Probe the OCR libraries only now that OCR is actually needed.
    ocr_available = test_ocr_availability()

    # Try EasyOCR (best for Arabic)
    if ocr_available['easyocr']:
        try:
            pages = extract_with_ocr_easyocr(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > min_ocr_chars:  # Minimum reasonable content
                return pages, 'easyocr'
        except Exception as e:
            print(f"  EasyOCR failed: {e}")

    # Try pytesseract
    if ocr_available['pytesseract']:
        try:
            pages = extract_with_ocr_pytesseract(file_path)
            total_chars = sum(len(page['content']) for page in pages)
            if total_chars > min_ocr_chars:
                return pages, 'pytesseract_ocr'
        except Exception as e:
            print(f"  Pytesseract OCR failed: {e}")

    # If no OCR available, provide instructions
    if not any(ocr_available.values()):
        print("\n  ERROR: No OCR libraries available!")
        print("  This PDF contains only images and requires OCR processing.")
        print("  To enable OCR support, install one of the following:")
        print("    1. pip install pytesseract + Install Tesseract OCR")
        print("    2. pip install easyocr (recommended for Arabic)")
        print("    3. Run: ./install_ocr.sh (installation script)")

        create_ocr_fix_script()
        raise Exception("OCR libraries required for image-based PDF")

    raise Exception("All extraction and OCR methods failed")


def main() -> int:
    """Command-line entry point.

    Usage:
        python enhanced_pdf_processor.py <pdf_file>
        python enhanced_pdf_processor.py --test-ocr

    With ``--test-ocr`` the OCR backends are probed and their status printed.
    Otherwise the given PDF is processed and a summary plus a short preview
    of the first two pages is printed.

    Returns:
        int: Process exit status. 0 when text was extracted, or when
        ``--test-ocr`` found at least one backend. 1 when the file does not
        exist, extraction raised, no text was obtained, or ``--test-ocr``
        found no backend.

    Raises:
        SystemExit: With code 1 when no command-line argument is given.
    """
    if len(sys.argv) < 2:
        print("Usage:")
        print("  python enhanced_pdf_processor.py <pdf_file>")
        print("  python enhanced_pdf_processor.py --test-ocr")
        print("Example: python enhanced_pdf_processor.py 'path/to/scanned.pdf'")
        sys.exit(1)

    if sys.argv[1] == '--test-ocr':
        print("Testing OCR library availability:")
        print("=" * 40)
        ocr_available = test_ocr_availability()

        if any(ocr_available.values()):
            print("\nOCR Status: READY")
            for lib, available in ocr_available.items():
                status = "Available" if available else "Not Available"
                print(f"  {lib}: {status}")
            return 0

        print("\nOCR Status: NOT READY")
        print("Run install_ocr.sh to install OCR libraries")
        return 1

    pdf_file = sys.argv[1]
    print("Enhanced PDF Processor with OCR Support")
    print(f"Target file: {pdf_file}")
    print("=" * 60)

    # Number of characters shown per page in the preview.
    preview_length = 200

    try:
        file_path = Path(pdf_file)
        if not file_path.exists():
            print(f"ERROR: File not found: {pdf_file}")
            return 1

        # Extract text with intelligent fallback
        pages, method_used = extract_text_with_fallback(file_path)

        # Results
        total_chars = sum(len(page['content']) for page in pages)

        print("\n" + "=" * 60)
        print("EXTRACTION RESULTS")
        print("=" * 60)
        print(f"Method used: {method_used}")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")

        if total_chars <= 0:
            print("\nWARNING: No text could be extracted")
            return 1

        print("\nSUCCESS: Text extracted successfully!")

        # Show sample content from the first 2 pages
        for page in pages[:2]:
            content = page['content'].strip()
            if content:
                # Only add an ellipsis when the preview really is truncated.
                ellipsis = "..." if len(content) > preview_length else ""
                print(f"\nPage {page['page_number']} (first 200 chars):")
                print(f"  {content[:preview_length]}{ellipsis}")
        return 0

    except Exception as e:
        # Top-level boundary: report the failure and signal it via the status.
        print(f"\nERROR: {e}")
        print("\nFull error traceback:")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    # Propagate main()'s status so shells and scripts can detect failures.
    sys.exit(main())