File size: 2,293 Bytes
3fe84f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import asyncio
from pathlib import Path
from PIL import Image
import pytesseract
import easyocr
import io


# EasyOCR and Tesseract identify languages differently (e.g. 'en' vs 'eng').
# This table translates the EasyOCR-style code accepted by extract_text_ocr
# into the name Tesseract expects; codes not listed are passed through as-is.
_EASYOCR_TO_TESSERACT_LANG = {
    'en': 'eng',
    'fr': 'fra',
    'de': 'deu',
    'es': 'spa',
    'it': 'ita',
    'pt': 'por',
    'nl': 'nld',
    'ru': 'rus',
    'uk': 'ukr',
    'pl': 'pol',
    'cs': 'ces',
    'sv': 'swe',
    'da': 'dan',
    'no': 'nor',
    'tr': 'tur',
    'ar': 'ara',
    'hi': 'hin',
    'th': 'tha',
    'vi': 'vie',
    'id': 'ind',
    'ja': 'jpn',
    'ko': 'kor',
    'ch_sim': 'chi_sim',
    'ch_tra': 'chi_tra',
}


def _ocr_with_easyocr(file_path: str, language: str) -> dict:
    """Run EasyOCR synchronously on file_path and summarise its detections."""
    # NOTE(review): a new Reader is built on every call, as in the original
    # code; consider reusing one per language if this path turns out to be hot.
    reader = easyocr.Reader([language], gpu=False)
    detections = reader.readtext(file_path)

    # Each detection is indexed as (_, text, confidence).
    text_lines = [detection[1] for detection in detections]
    confidences = [detection[2] for detection in detections]

    return {
        'text': '\n'.join(text_lines),
        'confidence': sum(confidences) / len(confidences) if confidences else 0,
        'line_count': len(text_lines),
        'method': 'easyocr',
    }


def _ocr_with_tesseract(file_path: str, language: str) -> dict:
    """Run Tesseract synchronously on file_path, translating the language code."""
    tesseract_lang = _EASYOCR_TO_TESSERACT_LANG.get(language, language)

    # The context manager releases the image file handle once OCR is done.
    with Image.open(file_path) as img:
        text = pytesseract.image_to_string(img, lang=tesseract_lang)

    return {
        'text': text,
        'confidence': 0.8,  # Estimated; image_to_string reports no confidence
        # Count only non-blank lines so trailing newlines / empty output
        # do not inflate the count.
        'line_count': sum(1 for line in text.splitlines() if line.strip()),
        'method': 'tesseract',
    }


async def extract_text_ocr(file_path: str, language: str = 'en') -> dict:
    """
    Extract text from image using OCR

    EasyOCR is tried first; if it raises for any reason, Tesseract is used as
    a fallback. Both engines are blocking, so they are run in the event loop's
    default executor instead of directly inside this coroutine.

    Args:
        file_path: Path to image file
        language: Language code for OCR, in EasyOCR form (e.g. 'en'). It is
            translated to the matching Tesseract name for the fallback.

    Returns:
        Dict with extracted text and confidence. On success the keys are
        'text', 'confidence', 'line_count' and 'method'. If both engines
        fail the dict holds 'text' (empty), 'error' and 'confidence' (0).
    """
    loop = asyncio.get_running_loop()

    try:
        # Try EasyOCR first (better quality)
        return await loop.run_in_executor(None, _ocr_with_easyocr, file_path, language)
    except Exception as easyocr_error:
        # Deliberately broad: any EasyOCR failure should trigger the fallback.
        first_error = easyocr_error

    try:
        # Fallback to Tesseract
        return await loop.run_in_executor(None, _ocr_with_tesseract, file_path, language)
    except Exception as tesseract_error:
        return {
            'text': '',
            'error': f"OCR failed: {str(first_error)}, {str(tesseract_error)}",
            'confidence': 0
        }


async def extract_text_from_pdf_image(pdf_path: str, page_num: int = 0, language: str = 'en') -> dict:
    """Extract text from scanned PDF using OCR

    The requested page is rendered to a PNG inside a private temporary
    directory, passed to extract_text_ocr, and the directory is removed
    afterwards.

    Args:
        pdf_path: Path to the PDF file.
        page_num: Zero-based index of the page to process.
        language: Language code forwarded to extract_text_ocr.

    Returns:
        The dict produced by extract_text_ocr, or on failure a dict with
        'text' (empty), 'error' and 'confidence' (0).
    """
    try:
        import os
        import tempfile

        # Reject negative indices up front rather than handing the converter
        # a first_page below 1.
        if page_num < 0:
            return {'text': '', 'error': f'Invalid page number: {page_num}', 'confidence': 0}

        from pdf2image import convert_from_path

        # Rendering a page is blocking work; keep it off the event loop.
        loop = asyncio.get_running_loop()
        images = await loop.run_in_executor(
            None,
            lambda: convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1),
        )

        if not images:
            return {'text': '', 'error': 'No pages found', 'confidence': 0}

        # A per-call temporary directory avoids collisions between concurrent
        # calls for the same page number and guarantees cleanup.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, f"page_{page_num}.png")
            images[0].save(temp_path, 'PNG')

            # Extract text
            return await extract_text_ocr(temp_path, language)

    except Exception as e:
        return {'text': '', 'error': str(e), 'confidence': 0}