#!/usr/bin/env python3
"""Manual smoke test that runs OCR processing over a few sample images.

For each image the script wraps the raw bytes in a minimal upload-like
object, passes it to ``process_file`` and prints a short summary: the keys
of the OCR result, a preview of the raw text, and whether the raw text looks
like it contains duplicated content.
"""
import os

# NOTE(review): ``st`` is not referenced in this script; the import is kept
# as-is in case it is needed for import-time side effects -- confirm.
import streamlit as st  # noqa: F401

from ocr_processing import process_file

# Number of characters of raw text shown in the preview.
_PREVIEW_CHARS = 100
# Raw text at or below this length is too short for the duplication check.
_MIN_TEXT_LEN_FOR_DUP_CHECK = 50
# The leading sample must be longer than this to count as a meaningful match.
_MIN_SAMPLE_LEN = 20


class MockFile:
    """Minimal stand-in for an uploaded file.

    Exposes a ``name`` attribute and a ``getvalue()`` method returning the
    file's bytes -- presumably the only parts of an upload object that
    ``process_file`` relies on (verify against ``ocr_processing``).
    """

    def __init__(self, name, content):
        self.name = name
        self._content = content

    def getvalue(self):
        """Return the content passed to the constructor."""
        return self._content


def _has_duplicated_text(raw_text):
    """Heuristically detect whether ``raw_text`` repeats its own beginning.

    The first quarter of the text (stripped of surrounding whitespace) is
    searched for in the second half; finding it there is a sign that the
    same content was emitted twice in sequence.

    Returns ``False`` for non-string input, for text of
    ``_MIN_TEXT_LEN_FOR_DUP_CHECK`` characters or fewer, and when the
    stripped sample is ``_MIN_SAMPLE_LEN`` characters or fewer.
    """
    if not isinstance(raw_text, str):
        return False
    if len(raw_text) <= _MIN_TEXT_LEN_FOR_DUP_CHECK:
        return False

    half_point = len(raw_text) // 2
    first_quarter = raw_text[: half_point // 2].strip()
    if len(first_quarter) <= _MIN_SAMPLE_LEN:
        return False
    return first_quarter in raw_text[half_point:]


def test_image(image_path):
    """Run OCR processing on one image and print a summary of the result.

    Args:
        image_path: Path to the image file to process.

    Returns:
        The value returned by ``process_file`` for this image.

    Raises:
        OSError: If ``image_path`` cannot be opened for reading.
        KeyError: If the result has no ``'ocr_contents'`` entry.
    """
    file_name = os.path.basename(image_path)
    print(f"\n\n===== Testing {file_name} =====")

    # Load the test image
    with open(image_path, 'rb') as f:
        file_bytes = f.read()

    # Create mock file and process it
    uploaded_file = MockFile(file_name, file_bytes)
    result = process_file(uploaded_file)

    ocr_contents = result['ocr_contents']

    # Display results summary
    print("\nOCR Content Keys:")
    for key in ocr_contents.keys():
        print(f"- {key}")

    # Look the raw text up once; both the preview and the duplication check
    # use it.
    raw_text = ocr_contents['raw_text'] if 'raw_text' in ocr_contents else None

    # Show a preview of raw_text (only when it is actually text, so that an
    # unexpected value type does not abort the summary)
    if isinstance(raw_text, str):
        if len(raw_text) > _PREVIEW_CHARS:
            preview = raw_text[:_PREVIEW_CHARS] + "..."
        else:
            preview = raw_text
        print(f"\nRaw Text Preview: {preview}")

    # Check for duplicated content
    if _has_duplicated_text(raw_text):
        print("\n⚠️ WARNING: Possible text duplication detected!")
    else:
        print("\n✅ No text duplication detected")

    return result


def main():
    """Run ``test_image`` on each sample image, skipping files that are missing."""
    # Test with different image types
    test_files = [
        'input/magician-or-bottle-cungerer.jpg',  # The problematic file
        'input/recipe.jpg',  # Simple text file
        'input/handwritten-letter.jpg',  # Mixed content
    ]

    for image_path in test_files:
        # A missing sample should not prevent the remaining samples from
        # being exercised.
        if not os.path.isfile(image_path):
            print(f"\n\n===== Skipping {image_path}: file not found =====")
            continue
        test_image(image_path)


if __name__ == "__main__":
    main()