"""OCR helpers: extract text from image files and from scanned PDF pages."""
import asyncio
import functools
import io
import tempfile
from pathlib import Path

import easyocr
import pytesseract
from PIL import Image

# EasyOCR language codes mapped to the names Tesseract uses for its language
# data. Codes that are not listed are passed to Tesseract unchanged, so callers
# that already supply a Tesseract-style code keep working.
_TESSERACT_LANG_CODES = {
    'en': 'eng',
    'fr': 'fra',
    'de': 'deu',
    'es': 'spa',
    'it': 'ita',
    'pt': 'por',
    'nl': 'nld',
    'ru': 'rus',
    'uk': 'ukr',
    'pl': 'pol',
    'sv': 'swe',
    'tr': 'tur',
    'vi': 'vie',
    'hi': 'hin',
    'ar': 'ara',
    'ja': 'jpn',
    'ko': 'kor',
    'ch_sim': 'chi_sim',
    'ch_tra': 'chi_tra',
}


@functools.lru_cache(maxsize=8)
def _get_easyocr_reader(language: str) -> "easyocr.Reader":
    """Return a CPU-only EasyOCR reader for *language*, built once and reused.

    Constructing a reader loads the recognition models, so it is cached per
    language instead of being rebuilt on every call. A failed construction
    raises and is not cached.
    """
    return easyocr.Reader([language], gpu=False)


def _run_easyocr(file_path: str, language: str) -> dict:
    """Run EasyOCR synchronously on *file_path* and build the result dict."""
    detections = _get_easyocr_reader(language).readtext(file_path)

    # Each detection is indexed as (bounding box, text, confidence).
    text_lines = [detection[1] for detection in detections]
    confidences = [detection[2] for detection in detections]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    return {
        'text': '\n'.join(text_lines),
        'confidence': avg_confidence,
        'line_count': len(text_lines),
        'method': 'easyocr',
    }


def _run_tesseract(file_path: str, language: str) -> dict:
    """Run Tesseract synchronously on *file_path* and build the result dict."""
    tesseract_lang = _TESSERACT_LANG_CODES.get(language, language)

    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(file_path) as img:
        text = pytesseract.image_to_string(img, lang=tesseract_lang)

    # Count only non-blank lines so trailing newlines / form feeds in the
    # Tesseract output do not inflate the count.
    non_blank_lines = [line for line in text.splitlines() if line.strip()]

    return {
        'text': text,
        'confidence': 0.8,  # Estimated: this call does not report a confidence
        'line_count': len(non_blank_lines),
        'method': 'tesseract',
    }


async def extract_text_ocr(file_path: str, language: str = 'en') -> dict:
    """
    Extract text from an image using OCR.

    EasyOCR is tried first; if it raises for any reason, Tesseract is used as
    a fallback. The blocking OCR work runs in the event loop's default
    executor so other coroutines keep running while it executes.

    Args:
        file_path: Path to image file
        language: Language code for OCR, in EasyOCR form (e.g. 'en'). It is
            translated to the matching Tesseract code for the fallback.

    Returns:
        On success, a dict with 'text', 'confidence', 'line_count' and
        'method' ('easyocr' or 'tesseract'). If both engines fail, a dict
        with empty 'text', an 'error' message naming both failures, and
        'confidence' 0. This function does not raise on OCR failure.
    """
    loop = asyncio.get_running_loop()

    try:
        # Try EasyOCR first (better quality)
        return await loop.run_in_executor(
            None, _run_easyocr, file_path, language
        )
    except Exception as easyocr_error:
        # Fallback to Tesseract
        try:
            return await loop.run_in_executor(
                None, _run_tesseract, file_path, language
            )
        except Exception as tesseract_error:
            return {
                'text': '',
                'error': f"OCR failed: {easyocr_error}, {tesseract_error}",
                'confidence': 0,
            }


async def extract_text_from_pdf_image(
    pdf_path: str, page_num: int = 0, language: str = 'en'
) -> dict:
    """
    Extract text from one page of a scanned PDF using OCR.

    The page is rendered to a PNG inside a private temporary directory, which
    is removed when the function returns, and then passed to
    ``extract_text_ocr``.

    Args:
        pdf_path: Path to the PDF file
        page_num: Zero-based index of the page to read
        language: Language code forwarded to ``extract_text_ocr``

    Returns:
        The dict produced by ``extract_text_ocr``, or a dict with empty
        'text' and an 'error' message if the page could not be rendered.
    """
    try:
        from pdf2image import convert_from_path

        loop = asyncio.get_running_loop()

        # pdf2image page numbers are 1-based; render only the requested page.
        render_page = functools.partial(
            convert_from_path,
            pdf_path,
            first_page=page_num + 1,
            last_page=page_num + 1,
        )
        images = await loop.run_in_executor(None, render_page)

        if not images:
            return {'text': '', 'error': 'No pages found'}

        # A per-call temporary directory avoids collisions between concurrent
        # calls for the same page number and guarantees cleanup.
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_path = str(Path(tmp_dir) / f"page_{page_num}.png")
            await loop.run_in_executor(None, images[0].save, temp_path, 'PNG')

            # Extract text
            return await extract_text_ocr(temp_path, language)

    except Exception as e:
        return {'text': '', 'error': str(e)}