Spaces:

MCP-1st-Birthday
/

LifeAdmin-AI

Running

App Files Files Community

LifeAdmin-AI / tools /pdf_server.py

Maheen001

Create tools/pdf_server.py

10bd609 verified 13 days ago

raw

history blame

4.11 kB

	import PyPDF2
	from datetime import datetime
	import re
	from utils.llm_utils import get_llm_response


	async def extract_text_from_pdf(file_path: str) -> str:
	    """Extract the text of every page of a PDF.

	    Each page's text is followed by a newline. If opening or parsing the
	    file fails, a module-level ``extract_text_from_pdf_image`` coroutine is
	    used as an OCR fallback when one is available; otherwise the original
	    error is re-raised so callers see the real cause instead of a
	    ``NameError`` for the missing helper.

	    Args:
	        file_path: Path to the PDF file.

	    Returns:
	        The concatenated page text, or the ``'text'`` entry of the OCR
	        fallback result ('' when that entry is absent).

	    Raises:
	        Exception: The original read/parse error when no OCR fallback is
	            available in this module.
	    """
	    try:
	        with open(file_path, 'rb') as file:
	            pdf_reader = PyPDF2.PdfReader(file)
	            # Guard against a page yielding None so one empty page does
	            # not abort the whole extraction with a TypeError.
	            return ''.join(
	                (page.extract_text() or '') + '\n'
	                for page in pdf_reader.pages
	            )
	    except Exception:
	        # Resolve the OCR helper at call time: it is not defined or imported
	        # in the visible part of this module, and referencing it directly
	        # would replace the real error with a NameError.
	        ocr_fallback = globals().get('extract_text_from_pdf_image')
	        if ocr_fallback is None:
	            raise
	        result = await ocr_fallback(file_path)
	        return result.get('text', '')


	async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
	    """
	    Summarize a PDF document with the LLM.

	    Args:
	        file_path: Path to PDF file
	        max_length: Maximum summary length in words. This is passed to the
	            LLM as an instruction; the response is not truncated here.

	    Returns:
	        On success, a dict with 'summary', 'metadata' (pages, author,
	        title), 'word_count' (words in the summary) and 'original_length'
	        (words in the extracted text). On failure, a dict with a single
	        'error' key: either no text could be extracted or a step raised.
	    """
	    try:
	        # Extract text
	        text = await extract_text_from_pdf(file_path)

	        if not text.strip():
	            return {'error': 'No text extracted from PDF'}

	        # Limit input: only the first 5000 characters are sent to the LLM.
	        # This note must live outside the f-string, otherwise it is sent to
	        # the model as if it were part of the document.
	        excerpt = text[:5000]

	        # Create summary with LLM
	        prompt = f"""Summarize the following document in {max_length} words or less.
	Be concise and capture key points, dates, amounts, and action items.

	Document text:
	{excerpt}

	Provide a clear, structured summary."""

	        summary = await get_llm_response(prompt, temperature=0.3)

	        # Extract metadata
	        with open(file_path, 'rb') as file:
	            pdf_reader = PyPDF2.PdfReader(file)
	            # Read the document info once; it may be missing entirely.
	            doc_info = pdf_reader.metadata
	            metadata = {
	                'pages': len(pdf_reader.pages),
	                'author': doc_info.get('/Author', 'Unknown') if doc_info else 'Unknown',
	                'title': doc_info.get('/Title', '') if doc_info else '',
	            }

	        return {
	            'summary': summary,
	            'metadata': metadata,
	            'word_count': len(summary.split()),
	            'original_length': len(text.split())
	        }

	    except Exception as e:
	        # Best-effort API: report any failure to the caller as an error dict.
	        return {'error': str(e)}


	async def extract_pdf_metadata(file_path: str) -> dict:
	    """
	    Extract structured metadata from PDF (dates, amounts, entities)

	    The document text is sent to the LLM, which is asked to answer with
	    JSON. If the LLM call or the JSON parsing fails, dates and dollar
	    amounts are pulled from the text with regular expressions instead.

	    Args:
	        file_path: Path to PDF file

	    Returns:
	        One of:
	        - {'success': True, 'metadata': <parsed JSON>, 'text_length': int}
	          when the LLM response parses as JSON;
	        - {'success': True, 'metadata': {'dates', 'amounts',
	          'document_type'}, 'method': 'regex_fallback'} when it does not;
	        - {'error': str} when text extraction fails or yields no text.
	    """
	    import json

	    # Text extraction gets its own guard: the regex fallback below needs
	    # `text`, so a failure here must not fall through to that handler
	    # (which would crash on the unbound name).
	    try:
	        text = await extract_text_from_pdf(file_path)
	    except Exception as e:
	        return {'error': str(e)}

	    if not text or not text.strip():
	        return {'error': 'No text extracted'}

	    try:
	        # Use LLM to extract structured data
	        prompt = f"""Extract structured information from this document. Return as JSON.

	Document text:
	{text[:3000]}

	Extract and return JSON with:
	- dates: list of dates found (YYYY-MM-DD format)
	- amounts: list of monetary amounts with currency
	- deadlines: list of deadline descriptions
	- key_entities: list of important names, organizations
	- document_type: type of document (invoice, contract, etc.)
	- action_items: list of tasks or actions mentioned

	Return ONLY valid JSON, no other text."""

	        response = await get_llm_response(prompt, temperature=0.1)

	        # Parse JSON from response, unwrapping a Markdown code fence if the
	        # model added one despite the instruction.
	        response = response.strip()
	        if '```json' in response:
	            response = response.split('```json')[1].split('```')[0].strip()
	        elif '```' in response:
	            response = response.split('```')[1].split('```')[0].strip()

	        metadata = json.loads(response)

	        return {
	            'success': True,
	            'metadata': metadata,
	            'text_length': len(text)
	        }

	    except Exception:
	        # Fallback to regex extraction when the LLM call or JSON parsing
	        # fails; `text` is guaranteed to be bound at this point.
	        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
	        amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)

	        return {
	            'success': True,
	            'metadata': {
	                'dates': dates[:10],
	                'amounts': amounts[:10],
	                'document_type': 'unknown'
	            },
	            'method': 'regex_fallback'
	        }