"""
File utility functions for historical OCR processing.
"""
import base64
import logging
import os
from pathlib import Path
# Configure logging: a module-level logger named "utils", emitting INFO and above.
logger = logging.getLogger("utils")
logger.setLevel(logging.INFO)
def get_base64_from_image(image_path):
    """
    Get base64 data URL from image file with proper MIME type.

    The MIME type is picked from the file extension (case-insensitive).
    ``.png``, ``.gif``, ``.jpg``/``.jpeg`` and ``.pdf`` are recognised; any
    other (or missing) extension falls back to ``image/jpeg``.

    Args:
        image_path: Path to the image file (string or path-like object)

    Returns:
        Base64 data URL with appropriate MIME type prefix, or an empty
        string if the file could not be read or encoded (the error is logged
        rather than raised).
    """
    # Extension -> MIME type; anything not listed uses the image/jpeg default.
    mime_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        # Convert to Path object for better handling
        path_obj = Path(image_path)
        mime_type = mime_types.get(path_obj.suffix.lower(), 'image/jpeg')

        # Read and encode file
        with open(path_obj, "rb") as file:
            encoded = base64.b64encode(file.read()).decode('utf-8')
        return f"data:{mime_type};base64,{encoded}"
    except Exception as e:
        # Best-effort contract: callers receive "" instead of an exception.
        logger.error(f"Error encoding file to base64: {str(e)}")
        return ""
def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
    """
    Get base64 data URL from file bytes with proper MIME type.

    When ``mime_type`` is not supplied (``None`` or an empty string) it is
    derived from the extension of ``file_name`` (case-insensitive): ``.png``,
    ``.gif``, ``.jpg``/``.jpeg`` and ``.pdf`` are recognised. Unknown
    extensions, or no ``file_name`` at all, fall back to ``image/jpeg``.

    Args:
        file_bytes: Binary file data
        mime_type: MIME type of the file (optional)
        file_name: Original file name for MIME type detection (optional)

    Returns:
        Base64 data URL with appropriate MIME type prefix, or an empty
        string if encoding failed (the error is logged rather than raised).
    """
    # Extension -> MIME type; anything not listed uses the image/jpeg default.
    mime_types = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.pdf': 'application/pdf',
    }
    try:
        # An empty mime_type would yield "data:;base64,...", which data-URL
        # consumers interpret as text/plain, so treat it as "not provided".
        if not mime_type:
            suffix = Path(file_name).suffix.lower() if file_name else ''
            # Default to image/jpeg instead of application/octet-stream
            # to ensure compatibility with Mistral AI OCR API
            mime_type = mime_types.get(suffix, 'image/jpeg')

        # Encode and create data URL
        encoded = base64.b64encode(file_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{encoded}"
    except Exception as e:
        # Best-effort contract: callers receive "" instead of an exception.
        logger.error(f"Error encoding bytes to base64: {str(e)}")
        return ""
def handle_temp_files(temp_file_paths):
    """
    Clean up temporary files.

    Each path is removed independently: a failure on one path is logged as a
    warning and does not stop the remaining paths from being processed.
    Paths that do not exist are skipped silently.

    Args:
        temp_file_paths: List of temporary file paths to clean up. ``None``
            or an empty list is a no-op. A single path (str, bytes or
            path-like) is treated as a one-element list instead of being
            iterated character by character.
    """
    if not temp_file_paths:
        return
    if isinstance(temp_file_paths, (str, bytes, os.PathLike)):
        temp_file_paths = [temp_file_paths]

    for temp_path in temp_file_paths:
        try:
            # Attempt the removal directly rather than checking existence
            # first, so a file that vanishes in between is not an error.
            os.unlink(temp_path)
        except FileNotFoundError:
            # Nothing to clean up for this path.
            continue
        except Exception as e:
            logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
        else:
            logger.info(f"Removed temporary file: {temp_path}")