Spaces:
Running
Running
| """ | |
| OCR utility functions for image processing and OCR operations. | |
| This module provides helper functions used across the Historical OCR application. | |
| """ | |
| import os | |
| import base64 | |
| import logging | |
| from pathlib import Path | |
| from typing import Union, Optional | |
# Logging setup: INFO-level configuration with a timestamped record format,
# plus a logger named after this module
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Optional dependencies: each *_AVAILABLE flag records whether the
# corresponding import succeeded, so callers can degrade gracefully
try:
    import pytesseract
except ImportError:
    logger.warning("pytesseract not available - local OCR fallback will not work")
    TESSERACT_AVAILABLE = False
else:
    TESSERACT_AVAILABLE = True

try:
    from PIL import Image
except ImportError:
    logger.warning("PIL not available - image preprocessing will be limited")
    PILLOW_AVAILABLE = False
else:
    PILLOW_AVAILABLE = True
def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as a base64 data URL for API submission with proper MIME type.

    The MIME type is chosen from the file extension (case-insensitive).
    Besides PNG, GIF, JPEG and PDF, the TIFF, WebP and BMP extensions are
    recognised so those files are not mislabelled as JPEG. Any other
    extension falls back to 'image/jpeg'.

    Args:
        image_path: Path to the image file (string or path object)

    Returns:
        Base64 data URL for the image, in the form
        "data:<mime type>;base64,<payload>"

    Raises:
        FileNotFoundError: If image_path does not refer to an existing file
    """
    # Extension -> MIME type; anything not listed uses the JPEG default below
    mime_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
        '.tif': 'image/tiff',
        '.tiff': 'image/tiff',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp',
    }

    # Path() accepts both strings and existing path objects
    image_file = Path(image_path)

    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Determine mime type based on file extension
    mime_type = mime_types.get(image_file.suffix.lower(), 'image/jpeg')

    # Encode image as base64 (base64 output is pure ASCII)
    encoded = base64.b64encode(image_file.read_bytes()).decode('ascii')
    return f"data:{mime_type};base64,{encoded}"
def try_local_ocr_fallback(file_path: Union[str, Path], base64_data_url: Optional[str] = None) -> Optional[str]:
    """
    Try to perform OCR using local Tesseract as a fallback when the API is unavailable.

    This is a best-effort helper: every failure is logged and reported as
    None rather than raised.

    Args:
        file_path: Path to the image file
        base64_data_url: Optional base64 data URL if already available.
            Not used by this function; kept so existing callers that pass
            it continue to work.

    Returns:
        Extracted text, or None if the dependencies are missing, an error
        occurred, or the OCR output was empty or whitespace-only
    """
    if not (TESSERACT_AVAILABLE and PILLOW_AVAILABLE):
        logger.warning("Local OCR fallback is not available (missing dependencies)")
        return None

    try:
        logger.info("Using local Tesseract OCR as fallback")
        # Open via a context manager so the underlying file handle is
        # released even when Tesseract raises
        with Image.open(file_path) as img:
            text = pytesseract.image_to_string(img)
    except Exception as e:
        # Deliberately broad: the fallback must never crash the caller
        logger.error(f"Error using local OCR fallback: {str(e)}")
        return None

    # Whitespace-only output (newlines, form feed) contains no usable text,
    # so it is treated the same as an empty result
    if text and text.strip():
        logger.info("Successfully extracted text using local Tesseract OCR")
        return text

    logger.warning("Tesseract extracted no text")
    return None