import PyPDF2
import json
import re
from datetime import datetime

from utils.llm_utils import get_llm_response
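
# --- OCR fallback (hypothetical sketch) ------------------------------------
# extract_text_from_pdf() below awaits an OCR helper named
# extract_text_from_pdf_image(), which is neither defined nor imported in
# this file. The function here is an assumed stand-in built on
# pdf2image + pytesseract (poppler and tesseract must be installed);
# replace it with the project's real OCR utility if one exists.
import pytesseract
from pdf2image import convert_from_path


async def extract_text_from_pdf_image(file_path: str) -> dict:
    """Hypothetical OCR fallback: rasterize each page and run Tesseract."""
    try:
        pages = convert_from_path(file_path)
        text = '\n'.join(pytesseract.image_to_string(page) for page in pages)
        return {'text': text}
    except Exception as e:
        return {'text': '', 'error': str(e)}
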

async def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file."""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ''
            for page in pdf_reader.pages:
                text += page.extract_text() + '\n'
            return text
    except Exception:
        # Direct extraction failed (e.g. a scanned PDF); try the OCR fallback
        result = await extract_text_from_pdf_image(file_path)
        return result.get('text', '')


async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """
    Summarize a PDF document.

    Args:
        file_path: Path to the PDF file
        max_length: Maximum summary length in words

    Returns:
        Dict with the summary and basic metadata
    """
    try:
        # Extract text
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Create the summary with the LLM; truncate the input to keep the prompt small
        prompt = f"""Summarize the following document in {max_length} words or less.
Be concise and capture key points, dates, amounts, and action items.

Document text:
{text[:5000]}

Provide a clear, structured summary."""
        summary = await get_llm_response(prompt, temperature=0.3)

        # Extract basic PDF metadata
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = {
                'pages': len(pdf_reader.pages),
                'author': pdf_reader.metadata.get('/Author', 'Unknown') if pdf_reader.metadata else 'Unknown',
                'title': pdf_reader.metadata.get('/Title', '') if pdf_reader.metadata else '',
            }

        return {
            'summary': summary,
            'metadata': metadata,
            'word_count': len(summary.split()),
            'original_length': len(text.split())
        }
    except Exception as e:
        return {'error': str(e)}


async def extract_pdf_metadata(file_path: str) -> dict:
    """
    Extract structured metadata from a PDF (dates, amounts, entities).

    Args:
        file_path: Path to the PDF file

    Returns:
        Dict with the extracted metadata
    """
    text = ''
    try:
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted'}

        # Use the LLM to extract structured data
        prompt = f"""Extract structured information from this document. Return as JSON.

Document text:
{text[:3000]}

Extract and return JSON with:
- dates: list of dates found (YYYY-MM-DD format)
- amounts: list of monetary amounts with currency
- deadlines: list of deadline descriptions
- key_entities: list of important names, organizations
- document_type: type of document (invoice, contract, etc.)
- action_items: list of tasks or actions mentioned

Return ONLY valid JSON, no other text."""
        response = await get_llm_response(prompt, temperature=0.1)

        # Parse JSON from the response, stripping Markdown code fences if present
        response = response.strip()
        if '```json' in response:
            response = response.split('```json')[1].split('```')[0].strip()
        elif '```' in response:
            response = response.split('```')[1].split('```')[0].strip()
        metadata = json.loads(response)

        return {
            'success': True,
            'metadata': metadata,
            'text_length': len(text)
        }
    except Exception:
        # LLM call or JSON parsing failed; fall back to simple regex extraction
        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
        amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
        return {
            'success': True,
            'metadata': {
                'dates': dates[:10],
                'amounts': amounts[:10],
                'document_type': 'unknown'
            },
            'method': 'regex_fallback'
        }
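

# --- Usage sketch (hypothetical) --------------------------------------------
# Minimal example of driving the coroutines above from a script; the PDF
# path is a placeholder. In the real application these helpers are
# presumably awaited from an async handler instead of asyncio.run().
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await summarize_pdf("sample.pdf", max_length=200)
        print(result.get('summary', result.get('error')))

        details = await extract_pdf_metadata("sample.pdf")
        print(details)

    asyncio.run(_demo())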