Spaces:
Running
Running
| import PyPDF2 | |
| from datetime import datetime | |
| import re | |
| from utils.llm_utils import get_llm_response | |
async def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of a PDF, falling back to OCR if reading fails.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The extracted text of every page, each page followed by a newline.
        If opening or parsing the PDF raises, the ``'text'`` entry of the
        OCR helper's result is returned instead (``''`` when that key is
        missing).
    """
    import logging

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join once instead of growing a string with repeated `+=`.
            return ''.join(
                page.extract_text() + '\n' for page in pdf_reader.pages
            )
    except Exception:
        # Best-effort by design: any read/parse failure triggers the OCR
        # fallback, but record why so the failure is not silently lost.
        logging.getLogger(__name__).warning(
            "Text extraction failed for %s; trying OCR fallback",
            file_path,
            exc_info=True,
        )
        # NOTE(review): extract_text_from_pdf_image is neither defined nor
        # imported in the visible part of this file - confirm it is in scope.
        result = await extract_text_from_pdf_image(file_path)
        return result.get('text', '')
async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """Summarize a PDF document with the LLM and report basic metadata.

    Args:
        file_path: Path to PDF file.
        max_length: Maximum summary length in words (passed to the LLM as an
            instruction in the prompt; not enforced on the response).

    Returns:
        On success, a dict with keys ``'summary'``, ``'metadata'`` (``pages``,
        ``author``, ``title``), ``'word_count'`` (words in the summary) and
        ``'original_length'`` (words in the extracted text).
        On failure, ``{'error': <message>}``; exceptions are not propagated.
    """
    try:
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Limit input: only the first 5000 characters are sent to the LLM.
        # This note used to sit inside the f-string, so "# Limit input" was
        # sent to the model as part of the prompt.
        excerpt = text[:5000]
        prompt = (
            f"Summarize the following document in {max_length} words or less.\n"
            "Be concise and capture key points, dates, amounts, and action items.\n"
            "Document text:\n"
            f"{excerpt}\n"
            "Provide a clear, structured summary."
        )
        summary = await get_llm_response(prompt, temperature=0.3)

        # Document-level metadata comes straight from the PDF itself.
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            info = pdf_reader.metadata  # read once; may be None or empty
            metadata = {
                'pages': len(pdf_reader.pages),
                'author': info.get('/Author', 'Unknown') if info else 'Unknown',
                'title': info.get('/Title', '') if info else '',
            }

        return {
            'summary': summary,
            'metadata': metadata,
            'word_count': len(summary.split()),
            'original_length': len(text.split()),
        }
    except Exception as e:
        # Boundary handler: callers receive an error dict, never an exception.
        return {'error': str(e)}
async def extract_pdf_metadata(file_path: str) -> dict:
    """Extract structured metadata from a PDF (dates, amounts, entities).

    The document text is sent to the LLM, which is asked to answer with JSON.
    If the LLM call or the JSON parsing fails, dates and dollar amounts are
    pulled from the text with regular expressions instead.

    Args:
        file_path: Path to PDF file.

    Returns:
        ``{'success': True, 'metadata': ..., 'text_length': ...}`` when the
        LLM response parses as JSON;
        ``{'success': True, 'metadata': {...}, 'method': 'regex_fallback'}``
        when the regex fallback was used;
        ``{'error': <message>}`` when text extraction fails or yields no text.
    """
    import json

    # Text extraction gets its own guard. The regex fallback below needs
    # `text`, so a failure here must not reach it (it previously crashed the
    # handler with an unbound `text`).
    try:
        text = await extract_text_from_pdf(file_path)
    except Exception as e:
        return {'error': str(e)}

    if not text.strip():
        return {'error': 'No text extracted'}

    # Only the first 3000 characters are sent to the LLM.
    excerpt = text[:3000]
    prompt = (
        "Extract structured information from this document. Return as JSON.\n"
        "Document text:\n"
        f"{excerpt}\n"
        "Extract and return JSON with:\n"
        "- dates: list of dates found (YYYY-MM-DD format)\n"
        "- amounts: list of monetary amounts with currency\n"
        "- deadlines: list of deadline descriptions\n"
        "- key_entities: list of important names, organizations\n"
        "- document_type: type of document (invoice, contract, etc.)\n"
        "- action_items: list of tasks or actions mentioned\n"
        "Return ONLY valid JSON, no other text."
    )

    try:
        response = await get_llm_response(prompt, temperature=0.1)
        response = response.strip()
        # Strip a Markdown code fence if the model wrapped its JSON in one.
        if '```json' in response:
            response = response.split('```json')[1].split('```')[0].strip()
        elif '```' in response:
            response = response.split('```')[1].split('```')[0].strip()
        metadata = json.loads(response)
    except Exception:
        # LLM unavailable or response not valid JSON: fall back to regex
        # extraction, capped at the first 10 matches of each kind.
        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
        amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
        return {
            'success': True,
            'metadata': {
                'dates': dates[:10],
                'amounts': amounts[:10],
                'document_type': 'unknown',
            },
            'method': 'regex_fallback',
        }

    return {
        'success': True,
        'metadata': metadata,
        'text_length': len(text),
    }