File size: 4,110 Bytes
10bd609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import PyPDF2
from datetime import datetime
import re
from utils.llm_utils import get_llm_response


async def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of all pages of a PDF as a single string.

    Every page's extracted text is followed by a newline. If any step of the
    PyPDF2 path raises, the fallback ``extract_text_from_pdf_image`` is
    awaited instead and the ``'text'`` entry of its result is returned.

    Args:
        file_path: Path to PDF file

    Returns:
        The concatenated page text, or the fallback's ``'text'`` value
        ('' when the fallback result has no such key).
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Extraction happens while the file handle is still open.
            return ''.join(
                page.extract_text() + '\n' for page in reader.pages
            )
    except Exception:
        # Any failure above (open, parse, per-page extraction) lands here.
        # NOTE(review): extract_text_from_pdf_image is neither defined nor
        # imported in the visible part of this file -- confirm it is in
        # scope, otherwise this fallback raises NameError.
        fallback = await extract_text_from_pdf_image(file_path)
        return fallback.get('text', '')


async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """
    Summarize PDF document

    The document text is extracted, its first 5000 characters are sent to the
    LLM with a summarization prompt, and basic PDF metadata is read with
    PyPDF2.

    Args:
        file_path: Path to PDF file
        max_length: Maximum summary length in words. This is only requested
            in the prompt; the returned summary is not truncated here.

    Returns:
        On success, a dict with:
            'summary': the LLM response,
            'metadata': dict with 'pages', 'author' and 'title',
            'word_count': number of whitespace-separated words in the summary,
            'original_length': number of whitespace-separated words in the
                extracted text.
        On failure, ``{'error': <message>}``; no exception is propagated.
    """
    try:
        # Extract text
        text = await extract_text_from_pdf(file_path)

        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Limit input. The slice is taken outside the f-string on purpose:
        # anything written after the placeholder inside the triple-quoted
        # string (such as a '#' remark) is literal prompt text, not a comment.
        excerpt = text[:5000]

        # Create summary with LLM
        prompt = f"""Summarize the following document in {max_length} words or less. 
Be concise and capture key points, dates, amounts, and action items.

Document text:
{excerpt}

Provide a clear, structured summary."""

        summary = await get_llm_response(prompt, temperature=0.3)

        # Extract metadata
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Read the document-information object once; it may be None.
            info = pdf_reader.metadata
            metadata = {
                'pages': len(pdf_reader.pages),
                'author': info.get('/Author', 'Unknown') if info else 'Unknown',
                'title': info.get('/Title', '') if info else '',
            }

        return {
            'summary': summary,
            'metadata': metadata,
            'word_count': len(summary.split()),
            'original_length': len(text.split())
        }

    except Exception as e:
        # Top-level boundary: callers receive an error dict, never a raise.
        return {'error': str(e)}


async def extract_pdf_metadata(file_path: str) -> dict:
    """
    Extract structured metadata from PDF (dates, amounts, entities)

    The first 3000 characters of the extracted text are sent to the LLM with
    a request for JSON. If the LLM call or the JSON parsing fails, dates and
    dollar amounts are pulled from the full text with regular expressions
    instead.

    Args:
        file_path: Path to PDF file

    Returns:
        One of:
            ``{'success': True, 'metadata': <parsed JSON>, 'text_length': int}``
                when the LLM response parsed as JSON;
            ``{'success': True, 'metadata': {'dates', 'amounts',
                'document_type'}, 'method': 'regex_fallback'}``
                when the LLM path failed (at most 10 dates and 10 amounts);
            ``{'error': <message>}`` when text extraction raised or produced
                no text.
    """
    import json

    # Text extraction has its own try block: the regex fallback below needs
    # `text`, so a failure here must not reach that handler (where `text`
    # would be unbound and the handler itself would crash).
    try:
        text = await extract_text_from_pdf(file_path)
    except Exception as exc:
        return {'error': str(exc)}

    if not text.strip():
        return {'error': 'No text extracted'}

    # Use LLM to extract structured data
    prompt = f"""Extract structured information from this document. Return as JSON.

Document text:
{text[:3000]}

Extract and return JSON with:
- dates: list of dates found (YYYY-MM-DD format)
- amounts: list of monetary amounts with currency
- deadlines: list of deadline descriptions
- key_entities: list of important names, organizations
- document_type: type of document (invoice, contract, etc.)
- action_items: list of tasks or actions mentioned

Return ONLY valid JSON, no other text."""

    try:
        response = await get_llm_response(prompt, temperature=0.1)

        # Parse JSON from response; drop a Markdown code fence if the model
        # wrapped its answer in one.
        payload = response.strip()
        if '```json' in payload:
            payload = payload.split('```json')[1].split('```')[0].strip()
        elif '```' in payload:
            # The piece between the first two fences cannot contain a fence.
            payload = payload.split('```')[1].strip()

        metadata = json.loads(payload)
    except Exception:
        # Fallback to regex extraction: best-effort result from the raw text
        # whenever the LLM call or the JSON decoding fails.
        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
        amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)

        return {
            'success': True,
            'metadata': {
                'dates': dates[:10],
                'amounts': amounts[:10],
                'document_type': 'unknown'
            },
            'method': 'regex_fallback'
        }

    return {
        'success': True,
        'metadata': metadata,
        'text_length': len(text)
    }