Spaces:
Running
Running
| import asyncio | |
| from pathlib import Path | |
| from PIL import Image | |
| import pytesseract | |
| import easyocr | |
| import io | |
# EasyOCR and Tesseract use different language identifiers ('en' vs 'eng').
# This maps the EasyOCR-style codes accepted by extract_text_ocr to the codes
# Tesseract expects; codes not listed here are passed to Tesseract unchanged.
_EASYOCR_TO_TESSERACT_LANG = {
    'ar': 'ara',
    'ch_sim': 'chi_sim',
    'ch_tra': 'chi_tra',
    'de': 'deu',
    'en': 'eng',
    'es': 'spa',
    'fr': 'fra',
    'hi': 'hin',
    'it': 'ita',
    'ja': 'jpn',
    'ko': 'kor',
    'nl': 'nld',
    'pl': 'pol',
    'pt': 'por',
    'ru': 'rus',
    'th': 'tha',
    'tr': 'tur',
    'uk': 'ukr',
    'vi': 'vie',
}


def _ocr_with_easyocr(file_path: str, language: str) -> dict:
    """Run EasyOCR on *file_path* and summarise its detections (blocking)."""
    # NOTE(review): a Reader is constructed on every call; if this path is hot,
    # consider caching one Reader per language.
    reader = easyocr.Reader([language], gpu=False)
    detections = reader.readtext(file_path)

    # Each detection is indexed as (…, text, confidence).
    text_lines = [detection[1] for detection in detections]
    confidences = [detection[2] for detection in detections]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    return {
        'text': '\n'.join(text_lines),
        'confidence': avg_confidence,
        'line_count': len(text_lines),
        'method': 'easyocr',
    }


def _ocr_with_tesseract(file_path: str, language: str) -> dict:
    """Run Tesseract on *file_path* and return its text (blocking)."""
    tesseract_lang = _EASYOCR_TO_TESSERACT_LANG.get(language, language)

    # Context manager so the underlying file handle is released promptly.
    with Image.open(file_path) as img:
        text = pytesseract.image_to_string(img, lang=tesseract_lang)

    return {
        'text': text,
        'confidence': 0.8,  # Estimated; not measured from Tesseract output
        # Count only lines that contain text, so empty output reports 0 lines
        # and trailing blank lines are not counted.
        'line_count': sum(1 for line in text.splitlines() if line.strip()),
        'method': 'tesseract',
    }


async def extract_text_ocr(file_path: str, language: str = 'en') -> dict:
    """
    Extract text from an image using OCR.

    EasyOCR is tried first; if it raises for any reason, Tesseract is used as
    a fallback. Both engines run in the event loop's default executor so the
    blocking OCR work does not stall other coroutines.

    Args:
        file_path: Path to image file
        language: EasyOCR-style language code (e.g. 'en'); it is translated to
            the matching Tesseract code for the fallback

    Returns:
        Dict with keys 'text', 'confidence', 'line_count' and 'method'
        ('easyocr' or 'tesseract'). If both engines fail, 'text' is empty,
        'confidence' and 'line_count' are 0, 'method' is None, and 'error'
        describes both failures.
    """
    loop = asyncio.get_running_loop()

    try:
        # Try EasyOCR first (better quality)
        return await loop.run_in_executor(
            None, _ocr_with_easyocr, file_path, language
        )
    except Exception as easyocr_error:
        # Keep a reference: the `as` name is unbound when the clause ends.
        primary_error = easyocr_error

    try:
        # Fallback to Tesseract
        return await loop.run_in_executor(
            None, _ocr_with_tesseract, file_path, language
        )
    except Exception as tesseract_error:
        return {
            'text': '',
            'error': f"OCR failed: {primary_error}, {tesseract_error}",
            'confidence': 0,
            'line_count': 0,
            'method': None,
        }
async def extract_text_from_pdf_image(
    pdf_path: str, page_num: int = 0, language: str = 'en'
) -> dict:
    """
    Extract text from one page of a scanned PDF using OCR.

    The page is rendered to a PNG inside a private temporary directory, which
    is removed once OCR has finished, and the image is then passed to
    extract_text_ocr.

    Args:
        pdf_path: Path to the PDF file
        page_num: Zero-based index of the page to OCR
        language: Language code forwarded to extract_text_ocr

    Returns:
        The dict returned by extract_text_ocr on success. On failure, a dict
        with empty 'text', 'confidence' 0 and an 'error' message.
    """
    import tempfile

    # pdf2image page numbers are 1-based below; a negative index would turn
    # into an invalid page range, so reject it up front.
    if page_num < 0:
        return {'text': '', 'error': 'page_num must be >= 0', 'confidence': 0}

    try:
        from pdf2image import convert_from_path

        # Rendering is blocking work, so keep it off the event loop.
        loop = asyncio.get_running_loop()
        images = await loop.run_in_executor(
            None,
            lambda: convert_from_path(
                pdf_path, first_page=page_num + 1, last_page=page_num + 1
            ),
        )
        if not images:
            return {'text': '', 'error': 'No pages found', 'confidence': 0}

        # A unique temporary directory avoids collisions between concurrent
        # calls for the same page number and guarantees cleanup.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = str(Path(temp_dir) / f"page_{page_num}.png")
            images[0].save(temp_path, 'PNG')
            return await extract_text_ocr(temp_path, language)
    except Exception as e:
        return {'text': '', 'error': str(e), 'confidence': 0}