LifeAdmin-AI / tools /pdf_server.py
Maheen001's picture
Create tools/pdf_server.py
10bd609 verified
raw
history blame
4.11 kB
import PyPDF2
from datetime import datetime
import re
from utils.llm_utils import get_llm_response
async def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of a PDF, with a newline after each page.

    Args:
        file_path: Path to PDF file

    Returns:
        The concatenated page text. If opening or parsing the file raises,
        the 'text' entry of the OCR fallback result is returned instead
        (empty string when that entry is missing).
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            page_texts = [
                page.extract_text() + '\n'
                for page in PyPDF2.PdfReader(pdf_file).pages
            ]
        return ''.join(page_texts)
    except Exception:
        # Any failure above (missing file, unreadable PDF, ...) lands here.
        # NOTE(review): extract_text_from_pdf_image is not defined or imported
        # in the visible part of this file -- confirm where it comes from.
        ocr_result = await extract_text_from_pdf_image(file_path)
        return ocr_result.get('text', '')
async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """
    Summarize PDF document

    Args:
        file_path: Path to PDF file
        max_length: Maximum summary length in words

    Returns:
        On success, a dict with 'summary', 'metadata' (pages, author, title),
        'word_count' (words in the summary) and 'original_length' (words in
        the extracted text). On any failure, a dict with a single 'error'
        message instead.
    """
    try:
        # Extract text
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Limit input: only the first 5000 characters go into the prompt.
        # The truncation lives here rather than next to the placeholder,
        # because a '#' inside the f-string is literal prompt text, not a
        # comment.
        excerpt = text[:5000]
        prompt = f"""Summarize the following document in {max_length} words or less.
Be concise and capture key points, dates, amounts, and action items.
Document text:
{excerpt}
Provide a clear, structured summary."""
        summary = await get_llm_response(prompt, temperature=0.3)

        # Extract metadata
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Read the document info once instead of on every lookup.
            info = pdf_reader.metadata
            metadata = {
                'pages': len(pdf_reader.pages),
                'author': info.get('/Author', 'Unknown') if info else 'Unknown',
                'title': info.get('/Title', '') if info else '',
            }

        return {
            'summary': summary,
            'metadata': metadata,
            'word_count': len(summary.split()),
            'original_length': len(text.split()),
        }
    except Exception as e:
        # Report every failure through the same {'error': ...} shape.
        return {'error': str(e)}
async def extract_pdf_metadata(file_path: str) -> dict:
    """
    Extract structured metadata from PDF (dates, amounts, entities)

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with extracted metadata:
        - {'success': True, 'metadata': ..., 'text_length': ...} when the LLM
          response parses as JSON;
        - {'success': True, 'metadata': ..., 'method': 'regex_fallback'} when
          the LLM call or JSON parsing fails;
        - {'error': message} when no text could be extracted.
    """
    import json

    # Text extraction gets its own try: the regex fallback below needs
    # `text`, so a failure here must not fall through to it.
    try:
        text = await extract_text_from_pdf(file_path)
    except Exception as e:
        return {'error': str(e)}

    if not text.strip():
        return {'error': 'No text extracted'}

    # Use LLM to extract structured data (first 3000 characters only)
    prompt = f"""Extract structured information from this document. Return as JSON.
Document text:
{text[:3000]}
Extract and return JSON with:
- dates: list of dates found (YYYY-MM-DD format)
- amounts: list of monetary amounts with currency
- deadlines: list of deadline descriptions
- key_entities: list of important names, organizations
- document_type: type of document (invoice, contract, etc.)
- action_items: list of tasks or actions mentioned
Return ONLY valid JSON, no other text."""

    try:
        response = await get_llm_response(prompt, temperature=0.1)
        metadata = json.loads(_strip_code_fence(response))
    except Exception:
        # Best effort: LLM unavailable or reply was not valid JSON.
        return {
            'success': True,
            'metadata': _regex_metadata(text),
            'method': 'regex_fallback',
        }

    return {
        'success': True,
        'metadata': metadata,
        'text_length': len(text),
    }


def _strip_code_fence(response: str) -> str:
    """Return the payload of the first Markdown code fence, or the stripped text."""
    response = response.strip()
    if '```json' in response:
        return response.split('```json')[1].split('```')[0].strip()
    if '```' in response:
        return response.split('```')[1].split('```')[0].strip()
    return response


def _regex_metadata(text: str) -> dict:
    """Pull up to ten numeric dates and ten dollar amounts out of text with regexes."""
    dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
    amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
    return {
        'dates': dates[:10],
        'amounts': amounts[:10],
        'document_type': 'unknown',
    }