Spaces:
Running
Running
File size: 2,694 Bytes
d411b46 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import PyPDF2
from pathlib import Path
from typing import List, Dict
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF file.

    The text of consecutive pages is separated by two newlines, and
    leading/trailing whitespace is stripped from the combined result.

    Args:
        file_path: Path to PDF file

    Returns:
        Extracted text, or an empty string if the file cannot be opened
        or processed (the error is printed, not raised)
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Collect per-page text while the file is still open, then join
            # once; growing a string with += per page can go quadratic on
            # long documents.
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        return '\n\n'.join(page_texts).strip()
    except Exception as e:
        # Best-effort by design: report the failure and hand back no text.
        print(f"PDF extraction error: {e}")
        return ""
def get_pdf_metadata(file_path: str) -> Dict:
    """
    Read the page count and basic document-information fields of a PDF.

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary with the keys 'num_pages', 'author', 'title',
        'subject' and 'creator'; the four text fields default to '' when
        the document has no metadata or lacks that entry. If anything
        fails, the error is printed and a dictionary holding only an
        'error' key with the exception message is returned instead.
    """
    # Result key -> entry name in the PDF document-information dictionary.
    info_fields = (
        ('author', '/Author'),
        ('title', '/Title'),
        ('subject', '/Subject'),
        ('creator', '/Creator'),
    )
    try:
        with open(file_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            result = {'num_pages': len(reader.pages)}
            # Start every text field empty so the keys are always present.
            for field_name, _ in info_fields:
                result[field_name] = ''
            doc_info = reader.metadata
            if doc_info:
                for field_name, pdf_key in info_fields:
                    result[field_name] = doc_info.get(pdf_key, '')
            return result
    except Exception as e:
        print(f"Metadata extraction error: {e}")
        return {'error': str(e)}
def split_pdf_pages(file_path: str, output_dir: str) -> List[str]:
    """
    Write each page of a PDF to its own single-page PDF file.

    Pages are saved as page_1.pdf, page_2.pdf, ... inside output_dir,
    which is created (including parents) when it does not exist yet.

    Args:
        file_path: Path to PDF file
        output_dir: Output directory for page files

    Returns:
        List of output file paths in page order, or an empty list if any
        step fails (the error is printed, not raised)
    """
    try:
        pages_dir = Path(output_dir)
        pages_dir.mkdir(parents=True, exist_ok=True)
        written = []
        with open(file_path, 'rb') as source:
            reader = PyPDF2.PdfReader(source)
            # File names are 1-based, so number the pages starting at 1.
            for page_number, page in enumerate(reader.pages, start=1):
                target = pages_dir / f"page_{page_number}.pdf"
                # A fresh writer per page keeps each output to one page.
                single_page = PyPDF2.PdfWriter()
                single_page.add_page(page)
                with target.open('wb') as sink:
                    single_page.write(sink)
                written.append(str(target))
        return written
    except Exception as e:
        print(f"PDF split error: {e}")
        return []
|