import os
import re
import io
import ast

import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes

DETECTION_PROMPT = """\
Analyze this document image and identify text regions following these rules:

1. GROUP RELATED CONTENT:
   - Full tables as SINGLE regions (including headers and all rows)
   - Paragraphs as SINGLE rectangular blocks (multiple lines as one box)
   - Keep text columns intact
   - Treat list items as single region if visually grouped

2. TEXT REGION REQUIREMENTS:
   - Boundaries must tightly wrap text content
   - Include 2% padding around text clusters
   - Exclude isolated decorative elements
   - Merge adjacent text fragments with ≤1% spacing

3. COORDINATE FORMAT:
   - Python list of lists [[xmin, ymin, xmax, ymax]]
   - Normalized 0-1 with 3 decimal places
   - Ordered top-to-bottom, left-to-right
   - Table example: [[0.12, 0.35, 0.88, 0.65]] for full table

4. SPECIAL CASES:
   - Table cells should NOT have individual boxes
   - Page headers/footers as separate regions
   - Text wrapped around images as distinct regions

Example response for table + 2 paragraphs:
[[0.07, 0.12, 0.93, 0.28],  # Header
 [0.12, 0.35, 0.88, 0.65],  # Full table
 [0.10, 0.70, 0.90, 0.85],  # First paragraph
 [0.10, 0.88, 0.90, 0.95]]  # Second paragraph

ONLY RETURN THE PYTHON LIST! No explanations.
"""
TEXT_EXTRACTION_PROMPT = "Extract the text in this image. Return only the exact text, nothing else."

def parse_list_boxes(text):
    """Parse the model response into a list of [xmin, ymin, xmax, ymax] boxes."""
    try:
        # Safely evaluate a literal Python list (avoids arbitrary code execution via eval)
        return ast.literal_eval(text.strip())
    except (ValueError, SyntaxError):
        # Fallback: pull individual [x, y, x, y] groups out of a noisy response
        matches = re.findall(r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]', text)
        return [[float(x) for x in m] for m in matches]
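
# Illustrative example only: if the model wraps the list in markdown fences or adds
# prose, ast.literal_eval raises SyntaxError and the regex fallback still recovers
# the boxes, e.g.
#   parse_list_boxes("```python\n[[0.10, 0.70, 0.90, 0.85]]\n```")
#   -> [[0.1, 0.7, 0.9, 0.85]]
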
def draw_bounding_boxes(image, boxes):
    """Draw numbered bounding boxes on the image."""
    if not boxes:
        return image
    draw = ImageDraw.Draw(image)
    width, height = image.size
    for i, box in enumerate(boxes):
        try:
            # Convert normalized coordinates to pixel values, clamping to [0, 1]
            xmin = max(0.0, min(1.0, box[0])) * width
            ymin = max(0.0, min(1.0, box[1])) * height
            xmax = max(0.0, min(1.0, box[2])) * width
            ymax = max(0.0, min(1.0, box[3])) * height
            # Draw bounding box
            draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
            # Draw number label
            label = str(i + 1)
            draw.text((xmin + 5, ymin + 5), label, fill="red")
        except Exception as e:
            st.error(f"Error drawing box: {str(e)}")
    return image
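
# Optional sketch, not wired in above: the imported ImageFont could supply a larger,
# more readable font for the region numbers. The font file name is an assumption;
# fall back to PIL's default bitmap font when it is not installed.
# Usage would be: draw.text((xmin + 5, ymin + 5), label, fill="red", font=_label_font())
def _label_font(size=24):
    try:
        return ImageFont.truetype("DejaVuSans-Bold.ttf", size)  # assumed font file
    except OSError:
        return ImageFont.load_default()
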
def extract_text_from_region(client, image, box):
    """Extract text from a specific region using Gemini."""
    try:
        width, height = image.size
        # Convert normalized coordinates to pixel values
        xmin = int(max(0.0, min(1.0, box[0])) * width)
        ymin = int(max(0.0, min(1.0, box[1])) * height)
        xmax = int(max(0.0, min(1.0, box[2])) * width)
        ymax = int(max(0.0, min(1.0, box[3])) * height)
        if xmin >= xmax or ymin >= ymax:
            return ""
        # Crop and convert to bytes
        cropped = image.crop((xmin, ymin, xmax, ymax))
        img_byte_arr = io.BytesIO()
        cropped.save(img_byte_arr, format='PNG')
        # Call Gemini API
        response = client.models.generate_content(
            model="gemini-2.5-pro-exp-03-25",
            contents=[
                TEXT_EXTRACTION_PROMPT,
                types.Part.from_bytes(
                    data=img_byte_arr.getvalue(),
                    mime_type="image/png"
                )
            ]
        )
        return response.text.strip()
    except Exception as e:
        st.error(f"Text extraction error: {str(e)}")
        return ""
# Streamlit UI
st.title("PDF Text Detection")
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

if uploaded_file and st.button("Analyze"):
    with st.spinner("Processing..."):
        try:
            images = convert_from_bytes(uploaded_file.read(), dpi=300)
            client = genai.Client(api_key=os.getenv("KEY"))
            tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
            for tab, image in zip(tabs, images):
                with tab:
                    col1, col2 = st.columns(2)
                    with col1:
                        st.image(image, caption="Original", use_container_width=True)
                    with col2:
                        # Get bounding boxes
                        img_byte_arr = io.BytesIO()
                        image.save(img_byte_arr, format='PNG')
                        response = client.models.generate_content(
                            model="gemini-2.0-flash-exp",
                            contents=[
                                DETECTION_PROMPT,
                                types.Part.from_bytes(
                                    data=img_byte_arr.getvalue(),
                                    mime_type="image/png"
                                )
                            ]
                        )
                        boxes = parse_list_boxes(response.text)
                        texts = [extract_text_from_region(client, image, box) for box in boxes]

                        # Draw annotated image
                        annotated = draw_bounding_boxes(image.copy(), boxes)
                        st.image(annotated,
                                 caption=f"Detected {len(boxes)} text regions",
                                 use_container_width=True)

                        # Display extracted texts
                        if any(texts):
                            st.subheader("Extracted Texts:")
                            for i, text in enumerate(texts, 1):
                                st.write(f"{i}. {text if text else 'No text detected'}")

                        # Debug section
                        debug_expander = st.expander("Debug Details")
                        with debug_expander:
                            st.write("**Raw API Response:**")
                            st.code(response.text)
                            st.write("**Parsed Boxes:**")
                            st.write(boxes)
        except Exception as e:
            st.error(f"Error: {str(e)}")