Spaces:
Running
Running
| """ | |
| UI utilities for OCR results display. | |
| """ | |
| import os | |
| import streamlit as st | |
| import json | |
| import base64 | |
| import io | |
| from datetime import datetime | |
| from utils.text_utils import format_ocr_text | |
| from utils.content_utils import classify_document_content, format_structured_data | |
# Terms that mark a topic tag as a time period rather than a subject.
# (Hoisted to one constant; the original duplicated this list in two places.)
_TIME_PERIOD_TERMS = ("century", "pre-", "era")


def _is_time_period(topic):
    """Return True if the topic string looks like a time-period tag."""
    lowered = topic.lower()
    return any(term in lowered for term in _TIME_PERIOD_TERMS)


def _tag_css_class(topic):
    """Return the CSS class string for a subject-tag badge based on its category."""
    lowered = topic.lower()
    tag_class = "subject-tag"  # Default class
    if any(term in lowered for term in ["language", "english", "french", "german", "latin"]):
        tag_class += " tag-language"  # Languages
    elif any(term in lowered for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
        tag_class += " tag-document-type"  # Document types
    elif any(term in lowered for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
        tag_class += " tag-subject"  # Subject domains
    elif "historical" in lowered and "document" in lowered:
        # NOTE(review): unreachable in practice -- any topic containing
        # "document" already matched the document-type branch above.
        # Kept for parity with the original classification table.
        tag_class += " tag-document-type"
    return tag_class


def flatten_content_fields(fields, parent_key=""):
    """Flatten nested OCR content dicts into a single {key: text} mapping.

    Nested keys are joined with underscores ({'a': {'b': 'x'}} -> {'a_b': 'x'}).
    A dict carrying a string 'text' key contributes just that text. Entries
    named 'error'/'partial_text' and falsy values are skipped; string values
    are stripped; lists are rendered via format_structured_data().
    """
    flat_fields = {}
    for field, content in fields.items():
        if field in ['error', 'partial_text'] or not content:
            continue
        key = f"{parent_key}_{field}".strip("_")
        if isinstance(content, str) and content.strip():
            flat_fields[key] = content.strip()
        elif isinstance(content, dict):
            if 'text' in content and isinstance(content['text'], str):
                flat_fields[key] = content['text'].strip()
            else:
                # Recurse into nested dictionaries, extending the key prefix.
                flat_fields.update(flatten_content_fields(content, f"{parent_key}_{field}"))
        elif isinstance(content, list):
            formatted = format_structured_data(content)
            if formatted:
                flat_fields[key] = formatted
    return flat_fields


def _render_metadata(result):
    """Render the document metadata header: type, pages, processing time, languages."""
    st.markdown("### Document Metadata")
    # Filter out large data structures so only compact metadata is considered.
    meta = {k: v for k, v in result.items()
            if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}

    # Compact primary-metadata line: document type and page counts.
    meta_html = '<div style="display: flex; flex-wrap: wrap; gap: 0.3rem; margin-bottom: 0.3rem;">'
    if 'detected_document_type' in meta:
        meta_html += f'<div><strong>Type:</strong> {meta["detected_document_type"]}</div>'
    if 'limited_pages' in meta:
        meta_html += f'<div><strong>Pages:</strong> {meta["limited_pages"]["processed"]}/{meta["limited_pages"]["total"]}</div>'
    meta_html += '</div>'
    st.markdown(meta_html, unsafe_allow_html=True)

    # Processing time on its own line for proper ordering of metadata fields.
    if 'processing_time' in meta:
        time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
        time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
        time_html += f'<div>{meta["processing_time"]:.1f}s</div>'
        time_html += '</div>'
        st.markdown(time_html, unsafe_allow_html=True)

    # Languages on a dedicated line, rendered as tag badges.
    if 'languages' in result and result['languages']:
        languages = [lang for lang in result['languages'] if lang is not None]
        if languages:
            lang_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
            lang_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Language:</div>'
            for lang in languages:
                clean_lang = str(lang).strip()
                if clean_lang:  # Only add if not empty
                    lang_html += f'<span class="subject-tag tag-language">{clean_lang}</span>'
            lang_html += '</div>'
            st.markdown(lang_html, unsafe_allow_html=True)


def _render_downloads(result):
    """Render JSON and ZIP download links.

    Deliberately best-effort: any failure is swallowed so a broken download
    payload never disrupts the rest of the results UI.
    """
    try:
        # Local imports keep heavy utils off the module import path.
        from utils.general_utils import create_descriptive_filename
        from utils.image_utils import truncate_base64_in_result, create_results_zip_in_memory

        original_file = result.get('file_name', 'document')
        base_name = os.path.splitext(create_descriptive_filename(original_file, result, ""))[0]

        # 1. JSON download -- base64 blobs truncated for readability.
        json_str = json.dumps(truncate_base64_in_result(result), indent=2)
        json_filename = f"{base_name}.json"
        json_b64 = base64.b64encode(json_str.encode()).decode()

        # 2. ZIP archive packaging all result files together.
        zip_data = create_results_zip_in_memory(result)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        zip_filename = f"{base_name}_{timestamp}.zip"
        zip_b64 = base64.b64encode(zip_data).decode()

        # Download line styled like the metadata rows, links in order of importance.
        download_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
        download_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Download:</div>'
        download_html += f'<a href="data:application/json;base64,{json_b64}" download="{json_filename}" class="subject-tag tag-download">JSON</a>'
        download_html += f'<a href="data:application/zip;base64,{zip_b64}" download="{zip_filename}" class="subject-tag tag-download">Zip Archive</a>'
        download_html += '</div>'
        st.markdown(download_html, unsafe_allow_html=True)
    except Exception:
        # Silent fail for downloads - don't disrupt the UI.
        pass


def _render_topic_tags(result):
    """Render time-period tags on one line, then the remaining subject tags."""
    topics = result.get('topics') or []
    if not topics:
        return

    # Time-period tags get their own "Time:" line.
    time_tags = [topic for topic in topics if _is_time_period(topic)]
    if time_tags:
        time_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
        time_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Time:</div>'
        for tag in time_tags:
            time_html += f'<span class="subject-tag tag-time-period">{tag}</span>'
        time_html += '</div>'
        st.markdown(time_html, unsafe_allow_html=True)

    # Everything else is a subject tag, badged by category.
    subject_tags = [topic for topic in topics if not _is_time_period(topic)]
    if subject_tags:
        tags_html = '<div style="display: flex; align-items: center; margin: 0.2rem 0; flex-wrap: wrap;">'
        tags_html += '<div style="margin-right: 0.3rem; font-weight: bold;">Subject Tags:</div>'
        tags_html += '<div style="display: flex; flex-wrap: wrap; gap: 2px; align-items: center;">'
        for topic in subject_tags:
            tags_html += f'<span class="{_tag_css_class(topic)}">{topic}</span>'
        tags_html += '</div></div>'
        st.markdown(tags_html, unsafe_allow_html=True)


def _collect_display_images(result):
    """Gather base64 images from pages_data and the raw API response.

    Returns a list of {'data', 'id', 'index'} dicts for the Images tab.
    """
    available_images = []
    # Primary source: processed pages_data entries.
    if result.get('has_images', False) and 'pages_data' in result:
        for page_idx, page in enumerate(result['pages_data']):
            if 'images' in page and len(page['images']) > 0:
                for img_idx, img in enumerate(page['images']):
                    if 'image_base64' in img:
                        available_images.append({
                            'source': 'pages_data',
                            'page': page_idx,
                            'index': img_idx,
                            'data': img['image_base64'],
                        })
    # Secondary source: images embedded in the raw response payload.
    raw_data = result.get('raw_response_data')
    if isinstance(raw_data, dict) and 'pages' in raw_data:
        for page_idx, page in enumerate(raw_data['pages']):
            if isinstance(page, dict) and 'images' in page:
                for img_idx, img in enumerate(page['images']):
                    if isinstance(img, dict) and 'base64' in img:
                        available_images.append({
                            'source': 'raw_response',
                            'page': page_idx,
                            'index': img_idx,
                            'data': img['base64'],
                        })
    # Normalize to the shape the Images tab consumes.
    return [
        {'data': img['data'], 'id': img.get('id', f"img_{img_idx}"), 'index': img_idx}
        for img_idx, img in enumerate(available_images)
        if 'data' in img
    ]


def _render_content(result):
    """Render the OCR content area: Document Content / Raw JSON / Images tabs."""
    content_area = st.container()
    has_images = result.get('has_images', False)

    with content_area:
        # Only dict-shaped OCR content is renderable here.
        if not isinstance(result['ocr_contents'], dict):
            return

        images_to_display = _collect_display_images(result)

        # Minimal spacing before the OCR results.
        st.markdown("<div style='margin: 8px 0 4px 0;'></div>", unsafe_allow_html=True)

        # Tabs for the different views; Images tab only when images exist.
        if has_images:
            doc_tab, json_tab, img_tab = st.tabs(["Document Content", "Raw JSON", "Images"])
        else:
            doc_tab, json_tab = st.tabs(["Document Content", "Raw JSON"])
            img_tab = None

        # Document Content tab: clean formatting matching the markdown exports.
        with doc_tab:
            st.markdown("## Text Content")
            # Maintain content purity -- never parse text content as JSON;
            # historical text may contain curly braces that aren't JSON.
            flat_content_fields = flatten_content_fields(result['ocr_contents'])
            for field, content in flat_content_fields.items():
                if not content or not content.strip():
                    continue
                if field == 'raw_text':
                    # raw_text is shown without a field label.
                    st.markdown(content)
                else:
                    # Show the field name (underscores as spaces) in bold, then
                    # the content. Fix: the original computed this display name
                    # but rendered the raw key instead.
                    field_display = field.replace('_', ' ')
                    st.markdown(f"**{field_display}:** {content}")
                # Spacing between fields.
                st.markdown("\n\n")

        # Raw JSON tab: the exact same truncated JSON served by the download
        # button; st.json provides built-in copy functionality.
        with json_tab:
            from utils.image_utils import truncate_base64_in_result
            st.json(truncate_base64_in_result(result))

        # Images tab: every collected document image.
        if has_images and img_tab:
            with img_tab:
                for i, img in enumerate(images_to_display):
                    st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)


def display_results(result, container, custom_prompt=""):
    """Display OCR results in the provided container.

    Args:
        result: Dict produced by the OCR pipeline. Keys read here include
            'detected_document_type', 'limited_pages', 'processing_time',
            'languages', 'topics', 'file_name', 'ocr_contents', 'has_images',
            'pages_data' and 'raw_response_data'.
        container: Streamlit container (e.g. ``st.container()``) to render into.
        custom_prompt: Optional custom processing instructions, shown in an
            expander at the bottom when non-empty.
    """
    with container:
        _render_metadata(result)
        _render_downloads(result)
        _render_topic_tags(result)
        if 'ocr_contents' in result:
            _render_content(result)
        if custom_prompt:
            with st.expander("Custom Processing Instructions"):
                st.write(custom_prompt)