# LifeAdmin-AI / utils/pdf_utils.py
# Commit d411b46 (verified) by Maheen001: "Create utils/pdf_utils.py" - 2.69 kB
import PyPDF2
from pathlib import Path
from typing import List, Dict
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract text from PDF file

    Pages are read in order and their text is joined with a blank line
    between consecutive pages; surrounding whitespace of the combined
    text is stripped.

    Args:
        file_path: Path to PDF file

    Returns:
        Extracted text, or an empty string if any error occurs (the
        error is printed, not raised)
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ''` keeps a page whose extraction yields None from
            # aborting the whole document with a TypeError.
            page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
        # A single join avoids re-copying the accumulated text per page.
        return '\n\n'.join(page_texts).strip()
    except Exception as e:
        # Best-effort contract: callers receive "" instead of an exception.
        print(f"PDF extraction error: {e}")
        return ""
def get_pdf_metadata(file_path: str) -> Dict:
    """
    Read basic document information from a PDF

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary with 'num_pages', 'author', 'title', 'subject' and
        'creator' (text fields default to ''); on failure, a dictionary
        holding only 'error' with the exception message
    """
    # Result key -> key looked up in the reader's metadata mapping.
    info_keys = (
        ('author', '/Author'),
        ('title', '/Title'),
        ('subject', '/Subject'),
        ('creator', '/Creator'),
    )
    try:
        with open(file_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            result = {'num_pages': len(reader.pages)}
            # Start every text field empty so the keys are always present.
            result.update((name, '') for name, _ in info_keys)
            if reader.metadata:
                for name, pdf_key in info_keys:
                    result[name] = reader.metadata.get(pdf_key, '')
            return result
    except Exception as err:
        print(f"Metadata extraction error: {err}")
        return {'error': str(err)}
def split_pdf_pages(file_path: str, output_dir: str) -> List[str]:
    """
    Write each page of a PDF to its own single-page file

    Pages are saved as page_1.pdf, page_2.pdf, ... inside output_dir,
    which is created (including parents) before the input is opened.

    Args:
        file_path: Path to PDF file
        output_dir: Output directory for page files

    Returns:
        Paths of the written page files in page order; an empty list if
        any error occurs (the error is printed, not raised)
    """
    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        written = []
        with open(file_path, 'rb') as source:
            reader = PyPDF2.PdfReader(source)
            # File names are 1-based, hence start=1.
            for number, page in enumerate(reader.pages, start=1):
                single_page = PyPDF2.PdfWriter()
                single_page.add_page(page)
                destination = target_dir / f"page_{number}.pdf"
                with open(destination, 'wb') as sink:
                    single_page.write(sink)
                written.append(str(destination))
        return written
    except Exception as err:
        print(f"PDF split error: {err}")
        return []