Spaces:
Running
Running
| # Standard library imports | |
| import os | |
| import logging | |
| from pathlib import Path | |
# Module-wide logging setup: timestamp, logger name, severity, then the message.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger shared by every function in this module.
logger = logging.getLogger(__name__)
def is_likely_letterhead(file_path, features=None):
    """
    Determine if a document is likely to contain letterhead or marginalia.

    This is a heuristic. The file name is checked for keyword substrings
    first; if none match and ``features`` is supplied, its text-density
    values are compared against fixed thresholds.

    Args:
        file_path: Path to the document image. Only the final path component
            (lower-cased) is inspected; the file itself is never opened.
        features: Optional dictionary of pre-extracted features. The keys read
            here are 'top_density' (a value above 0.5 counts as letterhead)
            and 'density_variance' (a value above 0.3 counts as marginalia).
            Keys that are missing or set to None are ignored.

    Returns:
        bool: True if the document likely contains letterhead, False otherwise
    """
    file_name = Path(file_path).name.lower()

    # Substring match, so 'letter' already covers 'letterhead'; both are kept
    # to make the intent explicit.
    letterhead_indicators = ('letter', 'letterhead', 'correspondence', 'memo')
    if any(indicator in file_name for indicator in letterhead_indicators):
        logger.info("Letterhead detected based on filename: %s", file_name)
        return True

    if features:
        # A None value means "not measured"; comparing it to a float would
        # raise TypeError, so treat it the same as a missing key.
        top_density = features.get('top_density')
        if top_density is not None and top_density > 0.5:
            # High text density at the top of the page may indicate letterhead.
            logger.info("Letterhead detected based on top text density: %s", top_density)
            return True

        density_variance = features.get('density_variance')
        if density_variance is not None and density_variance > 0.3:
            # Uneven text distribution may indicate marginalia.
            logger.info("Possible marginalia detected based on text density variance")
            return True

    # Default to standard document
    return False
def get_letterhead_prompt(file_path, features=None):
    """
    Build the OCR prompt used for letter / letterhead documents.

    Args:
        file_path: Path to the document image (accepted for interface
            symmetry; not read by this function)
        features: Optional dictionary of pre-extracted features. Truthy
            'is_historical' and 'has_marginalia' entries each append an extra
            paragraph of instructions, in that order.

    Returns:
        str: The base letterhead prompt, followed by any feature-specific
        paragraphs
    """
    # Instructions shared by every letterhead document.
    core_instructions = (
        "This document appears to be a letter or includes letterhead elements. "
        "Please extract the following components separately if present:\n"
        "1. Letterhead (header with logo, organization name, address, etc.)\n"
        "2. Date\n"
        "3. Recipient information (address, name, title)\n"
        "4. Salutation (e.g., 'Dear Sir/Madam')\n"
        "5. Main body text\n"
        "6. Closing (e.g., 'Sincerely')\n"
        "7. Signature\n"
        "8. Any footnotes, marginalia, or annotations\n\n"
        "Preserve the original formatting and structure as much as possible."
    )

    # Without features there is nothing to tailor.
    if not features:
        return core_instructions

    # Feature flag -> paragraph appended when that flag is present and truthy.
    optional_addenda = (
        (
            'is_historical',
            "\n\nThis appears to be a historical document. Pay special attention to older "
            "letterhead styles, formal language patterns, and period-specific formatting.",
        ),
        (
            'has_marginalia',
            "\n\nThe document contains marginalia or handwritten notes in the margins. "
            "Please extract these separately from the main text and indicate their position.",
        ),
    )

    pieces = [core_instructions]
    for flag, addendum in optional_addenda:
        if flag in features and features[flag]:
            pieces.append(addendum)
    return "".join(pieces)