# yolo_layoutlm / app.py
import gradio as gr
import json
import os
import tempfile
from pathlib import Path
# Import your pipeline function
from working_yolo_pipeline import run_document_pipeline, DEFAULT_LAYOUTLMV3_MODEL_PATH, WEIGHTS_PATH
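
# Note: run_document_pipeline(pdf_path, layoutlmv3_model_path) is assumed to
# return a JSON-serializable dict with the extracted document structure, or
# None on failure (see how process_pdf uses the result below).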


def process_pdf(pdf_file, layoutlmv3_model_path=None):
    """
    Wrapper function for the Gradio interface.

    Args:
        pdf_file: Path to the uploaded PDF (from gr.File with type="filepath")
        layoutlmv3_model_path: Optional custom LayoutLMv3 model path

    Returns:
        Tuple of (JSON string for display, path of the JSON file for download)
    """
    if pdf_file is None:
        return "❌ Error: No PDF file uploaded.", None

    # Use default model path if not provided
    if not layoutlmv3_model_path:
        layoutlmv3_model_path = DEFAULT_LAYOUTLMV3_MODEL_PATH

    # Verify model and weights exist
    if not os.path.exists(layoutlmv3_model_path):
        return f"❌ Error: LayoutLMv3 model not found at {layoutlmv3_model_path}", None
    if not os.path.exists(WEIGHTS_PATH):
        return f"❌ Error: YOLO weights not found at {WEIGHTS_PATH}", None

    try:
        # gr.File with type="filepath" passes the path as a string; older Gradio
        # versions pass a tempfile-like object with a .name attribute, so handle both.
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

        # Run the pipeline
        result = run_document_pipeline(pdf_path, layoutlmv3_model_path)
        if result is None:
            return "❌ Error: Pipeline failed to process the PDF. Check console for details.", None

        # Write the result to a temporary file so it can be offered for download
        output_filename = f"{Path(pdf_path).stem}_analysis.json"
        output_path = os.path.join(tempfile.mkdtemp(prefix="analysis_"), output_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        # Format JSON for display
        json_display = json.dumps(result, indent=2, ensure_ascii=False)

        # Truncate if too long for display
        if len(json_display) > 50000:
            json_display = json_display[:50000] + "\n\n... (truncated for display, download full file)"

        return json_display, output_path

    except Exception as e:
        return f"❌ Error during processing: {str(e)}", None


# Create Gradio interface
with gr.Blocks(title="Document Analysis Pipeline", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📄 Document Analysis Pipeline

    Upload a PDF document to extract structured data including questions, options, answers, passages, and embedded images.

    **Pipeline Steps:**
    1. 🔍 YOLO/OCR Preprocessing (word extraction + figure/equation detection)
    2. 🤖 LayoutLMv3 Inference (BIO tagging)
    3. 📊 Structured JSON Decoding
    4. 🖼️ Base64 Image Embedding
    """)

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="filepath"
            )
            model_path_input = gr.Textbox(
                label="LayoutLMv3 Model Path (optional)",
                placeholder=DEFAULT_LAYOUTLMV3_MODEL_PATH,
                value=DEFAULT_LAYOUTLMV3_MODEL_PATH,
                interactive=True
            )
            process_btn = gr.Button("🚀 Process Document", variant="primary", size="lg")

            gr.Markdown("""
            ### ℹ️ Notes:
            - Processing may take several minutes depending on PDF size
            - Figures and equations will be extracted and embedded as Base64
            - The output JSON includes structured questions, options, and answers
            """)

        with gr.Column(scale=2):
            json_output = gr.Code(
                label="Structured JSON Output",
                language="json",
                lines=25
            )
            download_output = gr.File(
                label="Download Full JSON",
                interactive=False
            )

    # Status/Examples section
    with gr.Row():
        gr.Markdown("""
        ### 📋 Output Format

        The pipeline generates JSON with the following structure:
        - **Questions**: Extracted question text
        - **Options**: Multiple choice options (A, B, C, D, etc.)
        - **Answers**: Correct answer(s)
        - **Passages**: Associated reading passages
        - **Images**: Base64-encoded figures and equations (embedded with keys like `figure1`, `equation2`)
        """)

    # Connect the button to the processing function
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, model_path_input],
        outputs=[json_output, download_output],
        api_name="process_document"
    )
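
    # Because api_name="process_document" is set, the endpoint can also be called
    # programmatically. A minimal sketch with gradio_client (assumes a recent
    # gradio_client version and a running server; URL, file name, and model path
    # below are placeholders):
    #
    #   from gradio_client import Client, handle_file
    #   client = Client("http://localhost:7860")
    #   json_text, json_file = client.predict(
    #       handle_file("sample.pdf"),       # pdf_input
    #       "<layoutlmv3-model-path>",       # model_path_input (optional override)
    #       api_name="/process_document",
    #   )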

    # Example section (optional - add example PDFs if available)
    # gr.Examples(
    #     examples=[
    #         ["examples/sample1.pdf"],
    #         ["examples/sample2.pdf"],
    #     ],
    #     inputs=pdf_input,
    # )


# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )