import PyPDF2
import json
import re
from datetime import datetime

from utils.llm_utils import get_llm_response
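
# --- OCR fallback (hypothetical sketch) ------------------------------------
# extract_text_from_pdf() below awaits an OCR helper named
# extract_text_from_pdf_image(), which is neither defined nor imported in
# this file. The function here is an assumed stand-in built on
# pdf2image + pytesseract (poppler and tesseract must be installed);
# replace it with the project's real OCR utility if one exists.
import pytesseract
from pdf2image import convert_from_path


async def extract_text_from_pdf_image(file_path: str) -> dict:
    """Hypothetical OCR fallback: rasterize each page and run Tesseract."""
    try:
        pages = convert_from_path(file_path)
        text = '\n'.join(pytesseract.image_to_string(page) for page in pages)
        return {'text': text}
    except Exception as e:
        return {'text': '', 'error': str(e)}
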

async def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file."""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ''
            for page in pdf_reader.pages:
                text += page.extract_text() + '\n'
            return text
    except Exception:
        # Direct extraction failed (e.g. a scanned PDF); try the OCR fallback
        result = await extract_text_from_pdf_image(file_path)
        return result.get('text', '')


async def summarize_pdf(file_path: str, max_length: int = 500) -> dict:
    """
    Summarize a PDF document.

    Args:
        file_path: Path to the PDF file
        max_length: Maximum summary length in words

    Returns:
        Dict with the summary and basic metadata
    """
    try:
        # Extract text
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted from PDF'}

        # Create the summary with the LLM; truncate the input to keep the prompt small
        prompt = f"""Summarize the following document in {max_length} words or less.
Be concise and capture key points, dates, amounts, and action items.

Document text:
{text[:5000]}

Provide a clear, structured summary."""
        summary = await get_llm_response(prompt, temperature=0.3)

        # Extract basic PDF metadata
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = {
                'pages': len(pdf_reader.pages),
                'author': pdf_reader.metadata.get('/Author', 'Unknown') if pdf_reader.metadata else 'Unknown',
                'title': pdf_reader.metadata.get('/Title', '') if pdf_reader.metadata else '',
            }

        return {
            'summary': summary,
            'metadata': metadata,
            'word_count': len(summary.split()),
            'original_length': len(text.split())
        }
    except Exception as e:
        return {'error': str(e)}


async def extract_pdf_metadata(file_path: str) -> dict:
    """
    Extract structured metadata from a PDF (dates, amounts, entities).

    Args:
        file_path: Path to the PDF file

    Returns:
        Dict with the extracted metadata
    """
    text = ''
    try:
        text = await extract_text_from_pdf(file_path)
        if not text.strip():
            return {'error': 'No text extracted'}

        # Use the LLM to extract structured data
        prompt = f"""Extract structured information from this document. Return as JSON.

Document text:
{text[:3000]}

Extract and return JSON with:
- dates: list of dates found (YYYY-MM-DD format)
- amounts: list of monetary amounts with currency
- deadlines: list of deadline descriptions
- key_entities: list of important names, organizations
- document_type: type of document (invoice, contract, etc.)
- action_items: list of tasks or actions mentioned

Return ONLY valid JSON, no other text."""
        response = await get_llm_response(prompt, temperature=0.1)

        # Parse JSON from the response, stripping Markdown code fences if present
        response = response.strip()
        if '```json' in response:
            response = response.split('```json')[1].split('```')[0].strip()
        elif '```' in response:
            response = response.split('```')[1].split('```')[0].strip()
        metadata = json.loads(response)

        return {
            'success': True,
            'metadata': metadata,
            'text_length': len(text)
        }
    except Exception:
        # LLM call or JSON parsing failed; fall back to simple regex extraction
        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text)
        amounts = re.findall(r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?', text)
        return {
            'success': True,
            'metadata': {
                'dates': dates[:10],
                'amounts': amounts[:10],
                'document_type': 'unknown'
            },
            'method': 'regex_fallback'
        }
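

# --- Usage sketch (hypothetical) --------------------------------------------
# Minimal example of driving the coroutines above from a script; the PDF
# path is a placeholder. In the real application these helpers are
# presumably awaited from an async handler instead of asyncio.run().
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await summarize_pdf("sample.pdf", max_length=200)
        print(result.get('summary', result.get('error')))

        details = await extract_pdf_metadata("sample.pdf")
        print(details)

    asyncio.run(_demo())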