import os
import re
import io
import ast

import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes

DETECTION_PROMPT = """\
Analyze this document image and identify text regions following these rules:

1. GROUP RELATED CONTENT:
   - Full tables as SINGLE regions (including headers and all rows)
   - Paragraphs as SINGLE rectangular blocks (multiple lines as one box)
   - Keep text columns intact
   - Treat list items as single region if visually grouped

2. TEXT REGION REQUIREMENTS:
   - Boundaries must tightly wrap text content
   - Include 2% padding around text clusters
   - Exclude isolated decorative elements
   - Merge adjacent text fragments with ≤1% spacing

3. COORDINATE FORMAT:
   - Python list of lists [[xmin, ymin, xmax, ymax]]
   - Normalized 0-1 with 3 decimal places
   - Ordered top-to-bottom, left-to-right
   - Table example: [[0.12, 0.35, 0.88, 0.65]] for full table

4. SPECIAL CASES:
   - Table cells should NOT have individual boxes
   - Page headers/footers as separate regions
   - Text wrapped around images as distinct regions

Example response for table + 2 paragraphs:
[[0.07, 0.12, 0.93, 0.28],  # Header
 [0.12, 0.35, 0.88, 0.65],  # Full table
 [0.10, 0.70, 0.90, 0.85],  # First paragraph
 [0.10, 0.88, 0.90, 0.95]]  # Second paragraph

ONLY RETURN THE PYTHON LIST! No explanations.
"""
TEXT_EXTRACTION_PROMPT = "Extract the text in this image. Return only the exact text, nothing else."

def parse_list_boxes(text):
    """Parse the model response into a list of [xmin, ymin, xmax, ymax] boxes."""
    try:
        # Safely evaluate a literal Python list (avoids arbitrary code execution via eval)
        return ast.literal_eval(text.strip())
    except (ValueError, SyntaxError):
        # Fallback: pull individual [x, y, x, y] groups out of a noisy response
        matches = re.findall(r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]', text)
        return [[float(x) for x in m] for m in matches]
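
# Illustrative example only: if the model wraps the list in markdown fences or adds
# prose, ast.literal_eval raises SyntaxError and the regex fallback still recovers
# the boxes, e.g.
#   parse_list_boxes("```python\n[[0.10, 0.70, 0.90, 0.85]]\n```")
#   -> [[0.1, 0.7, 0.9, 0.85]]
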
def draw_bounding_boxes(image, boxes):
    """Draw numbered bounding boxes on the image."""
    if not boxes:
        return image
    draw = ImageDraw.Draw(image)
    width, height = image.size
    for i, box in enumerate(boxes):
        try:
            # Convert normalized coordinates to pixel values, clamping to [0, 1]
            xmin = max(0.0, min(1.0, box[0])) * width
            ymin = max(0.0, min(1.0, box[1])) * height
            xmax = max(0.0, min(1.0, box[2])) * width
            ymax = max(0.0, min(1.0, box[3])) * height
            # Draw bounding box
            draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
            # Draw number label
            label = str(i + 1)
            draw.text((xmin + 5, ymin + 5), label, fill="red")
        except Exception as e:
            st.error(f"Error drawing box: {str(e)}")
    return image
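
# Optional sketch, not wired in above: the imported ImageFont could supply a larger,
# more readable font for the region numbers. The font file name is an assumption;
# fall back to PIL's default bitmap font when it is not installed.
# Usage would be: draw.text((xmin + 5, ymin + 5), label, fill="red", font=_label_font())
def _label_font(size=24):
    try:
        return ImageFont.truetype("DejaVuSans-Bold.ttf", size)  # assumed font file
    except OSError:
        return ImageFont.load_default()
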
def extract_text_from_region(client, image, box):
    """Extract text from a specific region using Gemini."""
    try:
        width, height = image.size
        # Convert normalized coordinates to pixel values
        xmin = int(max(0.0, min(1.0, box[0])) * width)
        ymin = int(max(0.0, min(1.0, box[1])) * height)
        xmax = int(max(0.0, min(1.0, box[2])) * width)
        ymax = int(max(0.0, min(1.0, box[3])) * height)
        if xmin >= xmax or ymin >= ymax:
            return ""
        # Crop and convert to bytes
        cropped = image.crop((xmin, ymin, xmax, ymax))
        img_byte_arr = io.BytesIO()
        cropped.save(img_byte_arr, format='PNG')
        # Call Gemini API
        response = client.models.generate_content(
            model="gemini-2.5-pro-exp-03-25",
            contents=[
                TEXT_EXTRACTION_PROMPT,
                types.Part.from_bytes(
                    data=img_byte_arr.getvalue(),
                    mime_type="image/png"
                )
            ]
        )
        return response.text.strip()
    except Exception as e:
        st.error(f"Text extraction error: {str(e)}")
        return ""
# Streamlit UI
st.title("PDF Text Detection")
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

if uploaded_file and st.button("Analyze"):
    with st.spinner("Processing..."):
        try:
            images = convert_from_bytes(uploaded_file.read(), dpi=300)
            client = genai.Client(api_key=os.getenv("KEY"))
            tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
            for tab, image in zip(tabs, images):
                with tab:
                    col1, col2 = st.columns(2)
                    with col1:
                        st.image(image, caption="Original", use_container_width=True)
                    with col2:
                        # Get bounding boxes
                        img_byte_arr = io.BytesIO()
                        image.save(img_byte_arr, format='PNG')
                        response = client.models.generate_content(
                            model="gemini-2.0-flash-exp",
                            contents=[
                                DETECTION_PROMPT,
                                types.Part.from_bytes(
                                    data=img_byte_arr.getvalue(),
                                    mime_type="image/png"
                                )
                            ]
                        )
                        boxes = parse_list_boxes(response.text)
                        texts = [extract_text_from_region(client, image, box) for box in boxes]

                        # Draw annotated image
                        annotated = draw_bounding_boxes(image.copy(), boxes)
                        st.image(annotated,
                                 caption=f"Detected {len(boxes)} text regions",
                                 use_container_width=True)

                        # Display extracted texts
                        if any(texts):
                            st.subheader("Extracted Texts:")
                            for i, text in enumerate(texts, 1):
                                st.write(f"{i}. {text if text else 'No text detected'}")

                        # Debug section
                        debug_expander = st.expander("Debug Details")
                        with debug_expander:
                            st.write("**Raw API Response:**")
                            st.code(response.text)
                            st.write("**Parsed Boxes:**")
                            st.write(boxes)
        except Exception as e:
            st.error(f"Error: {str(e)}")