import json
import re
from datetime import datetime

import PyPDF2

from utils.llm_utils import get_llm_response

# Character budgets for the document excerpt embedded in each LLM prompt.
_SUMMARY_INPUT_CHARS = 5000
_METADATA_INPUT_CHARS = 3000

# Maximum number of regex hits reported per category by the fallback path.
_FALLBACK_MAX_ITEMS = 10

# Patterns used when the LLM response cannot be obtained or parsed as JSON.
_DATE_PATTERN = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b')
_AMOUNT_PATTERN = re.compile(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?')


async def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF.

    Each page's text is followed by a newline. If reading the file with
    PyPDF2 raises for any reason, the work is delegated to
    ``extract_text_from_pdf_image`` and the ``'text'`` entry of its result
    is returned (empty string when that key is absent).

    Args:
        file_path: Path to PDF file

    Returns:
        The concatenated page text.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            return ''.join(
                page.extract_text() + '\n' for page in pdf_reader.pages
            )
    except Exception:
        # Try OCR fallback.
        # NOTE(review): extract_text_from_pdf_image is neither defined nor
        # imported in the visible part of this file -- confirm it is in
        # scope, otherwise this branch raises NameError.
        result = await extract_text_from_pdf_image(file_path)
        return result.get('text', '')


async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """Summarize a PDF document with the LLM.

    Args:
        file_path: Path to PDF file
        max_length: Maximum summary length in words (passed to the LLM as
            an instruction; it is not enforced on the response)

    Returns:
        On success, a dict with ``summary``, ``metadata`` (``pages``,
        ``author``, ``title``), ``word_count`` and ``original_length``.
        On failure, a dict with a single ``error`` message.
    """
    try:
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Limit input: only the leading excerpt of the document is sent.
        # (This note used to live inside the f-string and was therefore
        # sent to the model as part of the prompt.)
        excerpt = text[:_SUMMARY_INPUT_CHARS]
        prompt = f"""Summarize the following document in {max_length} words or less.
Be concise and capture key points, dates, amounts, and action items.

Document text:
{excerpt}

Provide a clear, structured summary."""

        summary = await get_llm_response(prompt, temperature=0.3)

        # Re-open the file to read document-level metadata.
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            info = pdf_reader.metadata
            metadata = {
                'pages': len(pdf_reader.pages),
                'author': info.get('/Author', 'Unknown') if info else 'Unknown',
                'title': info.get('/Title', '') if info else '',
            }

        return {
            'summary': summary,
            'metadata': metadata,
            'word_count': len(summary.split()),
            'original_length': len(text.split()),
        }
    except Exception as e:
        return {'error': str(e)}


def _strip_code_fence(response: str) -> str:
    """Return the contents of the first Markdown code fence, if any."""
    response = response.strip()
    if '```json' in response:
        return response.split('```json')[1].split('```')[0].strip()
    if '```' in response:
        return response.split('```')[1].split('```')[0].strip()
    return response


def _regex_fallback_metadata(text: str) -> dict:
    """Build the fallback result from date and dollar-amount regex hits."""
    dates = _DATE_PATTERN.findall(text)
    amounts = _AMOUNT_PATTERN.findall(text)
    return {
        'success': True,
        'metadata': {
            'dates': dates[:_FALLBACK_MAX_ITEMS],
            'amounts': amounts[:_FALLBACK_MAX_ITEMS],
            'document_type': 'unknown',
        },
        'method': 'regex_fallback',
    }


async def extract_pdf_metadata(file_path: str) -> dict:
    """Extract structured metadata from PDF (dates, amounts, entities).

    The LLM is asked for a JSON document; if that call or the JSON parsing
    fails, a regex scan of the extracted text is returned instead.

    Args:
        file_path: Path to PDF file

    Returns:
        ``{'success': True, 'metadata': ..., 'text_length': ...}`` when the
        LLM response parses as JSON; the same shape with
        ``'method': 'regex_fallback'`` (and no ``text_length``) when it does
        not; ``{'error': ...}`` when no text could be extracted.
    """
    # Text extraction gets its own try block: the regex fallback below needs
    # ``text``, so a failure here must not fall through to it (previously
    # that path raised UnboundLocalError from inside the except handler).
    try:
        text = await extract_text_from_pdf(file_path)
    except Exception as e:
        return {'error': str(e)}

    if not text.strip():
        return {'error': 'No text extracted'}

    excerpt = text[:_METADATA_INPUT_CHARS]
    prompt = f"""Extract structured information from this document. Return as JSON.

Document text:
{excerpt}

Extract and return JSON with:
- dates: list of dates found (YYYY-MM-DD format)
- amounts: list of monetary amounts with currency
- deadlines: list of deadline descriptions
- key_entities: list of important names, organizations
- document_type: type of document (invoice, contract, etc.)
- action_items: list of tasks or actions mentioned

Return ONLY valid JSON, no other text."""

    try:
        response = await get_llm_response(prompt, temperature=0.1)
        # The model may wrap its JSON in a Markdown code fence.
        metadata = json.loads(_strip_code_fence(response))
    except Exception:
        # Deliberate best-effort: any LLM or parsing failure degrades to
        # the regex extraction rather than surfacing an error.
        return _regex_fallback_metadata(text)

    return {
        'success': True,
        'metadata': metadata,
        'text_length': len(text),
    }