Maheen001 committed on
Commit
10bd609
·
verified ·
1 Parent(s): 3fe84f5

Create tools/pdf_server.py

Browse files
Files changed (1) hide show
  1. tools/pdf_server.py +134 -0
tools/pdf_server.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from datetime import datetime
3
+ import re
4
+ from utils.llm_utils import get_llm_response
5
+
6
+
7
async def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Each page's text is followed by a newline. If reading the PDF fails for
    any reason, an OCR fallback named ``extract_text_from_pdf_image`` is
    used when such a callable is available in this module's namespace.

    Args:
        file_path: Path to PDF file

    Returns:
        The concatenated page text, or the ``'text'`` entry of the OCR
        fallback's result (empty string if that entry is missing).

    Raises:
        Exception: The original read/parse error, when no OCR fallback is
            available to recover from it.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ''` guards against a page yielding no text object at all,
            # which would otherwise break the string concatenation.
            return ''.join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
    except Exception:
        # The OCR helper is not defined or imported in this file, so calling
        # it by bare name would raise NameError and hide the real failure.
        # Look it up at call time instead, so it is used if it has been
        # provided, and the original error is surfaced if it has not.
        ocr_fallback = globals().get('extract_text_from_pdf_image')
        if ocr_fallback is None:
            raise
        result = await ocr_fallback(file_path)
        return result.get('text', '')
20
+
21
+
22
def _read_pdf_document_info(file_path: str) -> dict:
    """Read page count, author and title from a PDF on a best-effort basis.

    A failure here must not discard an already generated summary, so errors
    are recorded under an ``'error'`` key instead of being raised.
    """
    info = {'pages': 0, 'author': 'Unknown', 'title': ''}
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            info['pages'] = len(pdf_reader.pages)
            document_info = pdf_reader.metadata
            if document_info:
                info['author'] = document_info.get('/Author', 'Unknown')
                info['title'] = document_info.get('/Title', '')
    except Exception as e:
        info['error'] = str(e)
    return info


async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """
    Summarize PDF document

    Args:
        file_path: Path to PDF file
        max_length: Maximum summary length in words

    Returns:
        Dict with ``summary``, ``metadata`` (pages, author, title, plus an
        ``error`` entry if the document info could not be read),
        ``word_count`` of the summary and ``original_length`` (word count of
        the extracted text). On failure, a dict with a single ``error`` key.
    """
    # Only the first characters of the document are sent to the LLM.
    max_input_chars = 5000

    try:
        text = await extract_text_from_pdf(file_path)

        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Built line by line so that no source-code comment can leak into
        # the prompt (the original embedded "# Limit input" in the string).
        prompt = (
            f"Summarize the following document in {max_length} words or less.\n"
            "Be concise and capture key points, dates, amounts, and action items.\n"
            "\n"
            "Document text:\n"
            f"{text[:max_input_chars]}\n"
            "\n"
            "Provide a clear, structured summary."
        )

        summary = await get_llm_response(prompt, temperature=0.3)

        return {
            'summary': summary,
            'metadata': _read_pdf_document_info(file_path),
            'word_count': len(summary.split()),
            'original_length': len(text.split()),
        }

    except Exception as e:
        return {'error': str(e)}
69
+
70
+
71
def _strip_code_fence(response: str) -> str:
    """Return the payload of the first Markdown code fence, if there is one."""
    cleaned = response.strip()
    # Try the more specific ```json fence first, then a plain ``` fence.
    for fence in ('```json', '```'):
        _, found, remainder = cleaned.partition(fence)
        if found:
            return remainder.partition('```')[0].strip()
    return cleaned


def _regex_pdf_metadata(text: str) -> dict:
    """Pull dates and dollar amounts out of text with regular expressions."""
    dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
    amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
    return {
        'dates': dates[:10],
        'amounts': amounts[:10],
        'document_type': 'unknown',
    }


async def extract_pdf_metadata(file_path: str) -> dict:
    """
    Extract structured metadata from PDF (dates, amounts, entities)

    The document text is sent to the LLM, which is asked for a JSON object.
    If the LLM call or the JSON parsing fails, a regex-based extraction of
    dates and dollar amounts is returned instead.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with extracted metadata. On success: ``success``, ``metadata``
        and ``text_length``; the regex fallback additionally sets
        ``method`` to ``'regex_fallback'``. If no text could be extracted,
        a dict with a single ``error`` key.
    """
    import json

    # Text extraction is handled separately from the LLM step: the regex
    # fallback below needs `text`, so it cannot recover from a failure here.
    try:
        text = await extract_text_from_pdf(file_path)
    except Exception as e:
        return {'error': str(e)}

    if not text.strip():
        return {'error': 'No text extracted'}

    try:
        # Use LLM to extract structured data
        prompt = (
            "Extract structured information from this document. Return as JSON.\n"
            "\n"
            "Document text:\n"
            f"{text[:3000]}\n"
            "\n"
            "Extract and return JSON with:\n"
            "- dates: list of dates found (YYYY-MM-DD format)\n"
            "- amounts: list of monetary amounts with currency\n"
            "- deadlines: list of deadline descriptions\n"
            "- key_entities: list of important names, organizations\n"
            "- document_type: type of document (invoice, contract, etc.)\n"
            "- action_items: list of tasks or actions mentioned\n"
            "\n"
            "Return ONLY valid JSON, no other text."
        )

        response = await get_llm_response(prompt, temperature=0.1)

        # The model may wrap its JSON in a Markdown code fence.
        metadata = json.loads(_strip_code_fence(response))

        return {
            'success': True,
            'metadata': metadata,
            'text_length': len(text),
        }

    except Exception:
        # Fallback to regex extraction when the LLM call or JSON parse fails
        return {
            'success': True,
            'metadata': _regex_pdf_metadata(text),
            'method': 'regex_fallback',
            'text_length': len(text),
        }