from pathlib import Path
from typing import List, Dict

import PyPDF2


def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF file.

    Pages are separated by a blank line in the result, and leading/trailing
    whitespace of the combined text is stripped.

    Args:
        file_path: Path to PDF file

    Returns:
        Extracted text, or an empty string if the file could not be read
        or parsed (the error is printed, not raised).
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ''` guards against a page for which no text is returned,
            # so one empty page cannot abort extraction of the whole file.
            page_texts = [
                page.extract_text() or '' for page in pdf_reader.pages
            ]
        return '\n\n'.join(page_texts).strip()
    except Exception as e:
        # Deliberate best-effort: callers get "" instead of an exception.
        print(f"PDF extraction error: {e}")
        return ""


def get_pdf_metadata(file_path: str) -> Dict:
    """
    Get basic metadata of a PDF file.

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary with the keys 'num_pages', 'author', 'title', 'subject'
        and 'creator'. Text fields default to '' when the document carries
        no metadata or the entry is missing. On failure the error is
        printed and ``{'error': <message>}`` is returned instead.
    """
    # Result key -> key used in the PDF document information dictionary.
    info_keys = {
        'author': '/Author',
        'title': '/Title',
        'subject': '/Subject',
        'creator': '/Creator',
    }
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = {'num_pages': len(pdf_reader.pages)}
            # Start from empty defaults so all keys exist even when the
            # document has no metadata at all.
            metadata.update({name: '' for name in info_keys})

            doc_info = pdf_reader.metadata
            if doc_info:
                for name, pdf_key in info_keys.items():
                    metadata[name] = doc_info.get(pdf_key, '')
            return metadata
    except Exception as e:
        # Deliberate best-effort: report the failure in the returned dict.
        print(f"Metadata extraction error: {e}")
        return {'error': str(e)}


def split_pdf_pages(file_path: str, output_dir: str) -> List[str]:
    """
    Split a PDF into individual single-page PDF files.

    The output directory is created if it does not exist. Pages are written
    as ``page_1.pdf``, ``page_2.pdf``, ... (1-based) inside it.

    Args:
        file_path: Path to PDF file
        output_dir: Output directory for page files

    Returns:
        List of output file paths in page order, or an empty list if any
        step failed (the error is printed, not raised). Files written
        before the failure are left on disk.
    """
    try:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            output_files = []

            for i, page in enumerate(pdf_reader.pages):
                # A fresh writer per page so each file holds exactly one page.
                writer = PyPDF2.PdfWriter()
                writer.add_page(page)

                output_file = output_path / f"page_{i+1}.pdf"
                with open(output_file, 'wb') as output:
                    writer.write(output)

                output_files.append(str(output_file))

            return output_files
    except Exception as e:
        # Deliberate best-effort: callers get [] instead of an exception.
        print(f"PDF split error: {e}")
        return []