Spaces:
Running
Running
| """ | |
| Utility function for processing files with OCR in the Historical OCR Workshop app. | |
| """ | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| from datetime import datetime | |
def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=None):
    """Run OCR on an uploaded file and return the results with metadata.

    The upload's bytes are written to a temporary file on disk (keeping the
    original file extension), handed to ``processor.process_file`` and the
    temporary file is removed again before returning, whether or not
    processing succeeded.

    Args:
        uploaded_file: Upload object exposing a ``name`` attribute and a
            ``getvalue()`` method that returns the file's bytes.
        use_vision: Whether to use the vision model; forwarded to the processor.
        processor: Object with a ``process_file`` method. If None, a
            ``StructuredOCR`` instance is imported and created.
        custom_prompt: Optional additional instructions for the model;
            forwarded to the processor.

    Returns:
        dict: On success, the processor's result dict extended with
        ``file_name``, ``processed_at`` (ISO timestamp), ``file_size_mb``
        (rounded to 2 decimals) and ``use_vision``. On any failure while
        saving or processing the file, ``{"error": <message>, "file_name": <name>}``.
    """
    # Import the processor lazily so callers can inject their own.
    if processor is None:
        from structured_ocr import StructuredOCR
        processor = StructuredOCR()

    file_name = uploaded_file.name
    suffix = Path(file_name).suffix

    # temp_path stays None until the temporary file actually exists, so the
    # cleanup below knows whether there is anything to remove.
    temp_path = None
    try:
        # Saving the upload happens inside the try block: a failing
        # getvalue()/write() must neither leak the delete=False temp file
        # nor bypass the error-dict contract used for every other failure.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            temp_path = tmp.name
            tmp.write(uploaded_file.getvalue())

        # Anything that is not a PDF is treated as an image.
        file_type = "pdf" if suffix.lower() == ".pdf" else "image"

        # Measured after the file is closed so all bytes are flushed to disk.
        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

        # The size is passed along so the processor can limit pages itself.
        result = processor.process_file(
            temp_path,
            file_type=file_type,
            use_vision=use_vision,
            file_size_mb=file_size_mb,
            custom_prompt=custom_prompt,
        )

        # Add processing metadata.
        result.update({
            "file_name": file_name,
            "processed_at": datetime.now().isoformat(),
            "file_size_mb": round(file_size_mb, 2),
            "use_vision": use_vision,
        })
        return result
    except Exception as e:
        # Boundary for the UI layer: report the failure instead of raising.
        return {
            "error": str(e),
            "file_name": file_name,
        }
    finally:
        # Clean up the temporary file; it may already be gone (e.g. removed
        # by the processor), which is not an error.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass