Spaces:

MCP-1st-Birthday
/

LifeAdmin-AI

Running

App Files Files Community

Maheen001 committed 10 days ago

Commit

10bd609

verified ·

1 Parent(s): 3fe84f5

Create tools/pdf_server.py

Browse files

Files changed (1) hide show

tools/pdf_server.py +134 -0

tools/pdf_server.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import PyPDF2
+from datetime import datetime
+import re
+from utils.llm_utils import get_llm_response
+async def extract_text_from_pdf(file_path: str) -> str:
+    """Extract text from PDF"""
+    try:
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            text = ''
+            for page in pdf_reader.pages:
+                text += page.extract_text() + '\n'
+            return text
+    except Exception as e:
+        # Try OCR fallback
+        result = await extract_text_from_pdf_image(file_path)
+        return result.get('text', '')
+async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
+    """
+    Summarize PDF document
+    Args:
+        file_path: Path to PDF file
+        max_length: Maximum summary length in words
+    Returns:
+        Dict with summary and metadata
+    """
+    try:
+        # Extract text
+        text = await extract_text_from_pdf(file_path)
+        if not text.strip():
+            return {'error': 'No text extracted from PDF'}
+        # Create summary with LLM
+        prompt = f"""Summarize the following document in {max_length} words or less.
+Be concise and capture key points, dates, amounts, and action items.
+Document text:
+{text[:5000]}  # Limit input
+Provide a clear, structured summary."""
+        summary = await get_llm_response(prompt, temperature=0.3)
+        # Extract metadata
+        with open(file_path, 'rb') as file:
+            pdf_reader = PyPDF2.PdfReader(file)
+            metadata = {
+                'pages': len(pdf_reader.pages),
+                'author': pdf_reader.metadata.get('/Author', 'Unknown') if pdf_reader.metadata else 'Unknown',
+                'title': pdf_reader.metadata.get('/Title', '') if pdf_reader.metadata else '',
+            }
+        return {
+            'summary': summary,
+            'metadata': metadata,
+            'word_count': len(summary.split()),
+            'original_length': len(text.split())
+        }
+    except Exception as e:
+        return {'error': str(e)}
+async def extract_pdf_metadata(file_path: str) -> dict:
+    """
+    Extract structured metadata from PDF (dates, amounts, entities)
+    Args:
+        file_path: Path to PDF file
+    Returns:
+        Dict with extracted metadata
+    """
+    try:
+        text = await extract_text_from_pdf(file_path)
+        if not text.strip():
+            return {'error': 'No text extracted'}
+        # Use LLM to extract structured data
+        prompt = f"""Extract structured information from this document. Return as JSON.
+Document text:
+{text[:3000]}
+Extract and return JSON with:
+- dates: list of dates found (YYYY-MM-DD format)
+- amounts: list of monetary amounts with currency
+- deadlines: list of deadline descriptions
+- key_entities: list of important names, organizations
+- document_type: type of document (invoice, contract, etc.)
+- action_items: list of tasks or actions mentioned
+Return ONLY valid JSON, no other text."""
+        response = await get_llm_response(prompt, temperature=0.1)
+        # Parse JSON from response
+        import json
+        response = response.strip()
+        if '```json' in response:
+            response = response.split('```json')[1].split('```')[0].strip()
+        elif '```' in response:
+            response = response.split('```')[1].split('```')[0].strip()
+        metadata = json.loads(response)
+        return {
+            'success': True,
+            'metadata': metadata,
+            'text_length': len(text)
+        }
+    except Exception as e:
+        # Fallback to regex extraction
+        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
+        amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
+        return {
+            'success': True,
+            'metadata': {
+                'dates': dates[:10],
+                'amounts': amounts[:10],
+                'document_type': 'unknown'
+            },
+            'method': 'regex_fallback'
+        }