Spaces:
Running
Running
| import PyPDF2 | |
| from pathlib import Path | |
| from typing import List, Dict | |
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page in a PDF file.

    The text of consecutive pages is separated by a blank line, and the
    combined result is stripped of leading and trailing whitespace.

    Args:
        file_path: Path to PDF file

    Returns:
        Extracted text, or an empty string if the file could not be opened
        or read (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ''` keeps a page that yields no text object (None) from
            # raising TypeError and discarding the text of every other page.
            page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
        # A single join replaces repeated string concatenation; because of the
        # final strip() the result equals appending '\n\n' after each page.
        return '\n\n'.join(page_texts).strip()
    except Exception as e:
        # Best-effort contract: report the problem and return no text.
        print(f"PDF extraction error: {e}")
        return ""
def get_pdf_metadata(file_path: str) -> Dict:
    """
    Read the page count and basic document information of a PDF.

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary with the keys 'num_pages', 'author', 'title', 'subject'
        and 'creator'. The four text fields default to '' when the PDF
        carries no metadata or lacks the entry. If anything goes wrong, the
        error is printed and a dictionary with the single key 'error'
        (the exception message) is returned instead.
    """
    # Result key -> key of the PDF document information dictionary.
    info_keys = (
        ('author', '/Author'),
        ('title', '/Title'),
        ('subject', '/Subject'),
        ('creator', '/Creator'),
    )
    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            result = {'num_pages': len(reader.pages)}
            # Start every text field empty; overwritten below when available.
            for field_name, _ in info_keys:
                result[field_name] = ''
            doc_info = reader.metadata
            if doc_info:
                for field_name, pdf_key in info_keys:
                    result[field_name] = doc_info.get(pdf_key, '')
            return result
    except Exception as exc:
        print(f"Metadata extraction error: {exc}")
        return {'error': str(exc)}
def split_pdf_pages(file_path: str, output_dir: str) -> List[str]:
    """
    Split PDF into individual page files.

    Each page is written to ``<output_dir>/page_<n>.pdf`` with ``n``
    starting at 1. The output directory (including missing parents) is
    created only after the input file has been opened successfully, so a
    missing or unreadable input does not leave an empty directory behind.

    Args:
        file_path: Path to PDF file
        output_dir: Output directory for page files

    Returns:
        List of output file paths in page order. On any error the message
        is printed and an empty list is returned, even if some page files
        had already been written before the failure.
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Deferred until the input was opened and handed to PdfReader
            # without error; see docstring.
            output_path = Path(output_dir)
            output_path.mkdir(parents=True, exist_ok=True)
            output_files = []
            # Pages are written while the source file is still open.
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                writer = PyPDF2.PdfWriter()
                writer.add_page(page)
                output_file = output_path / f"page_{page_number}.pdf"
                with open(output_file, 'wb') as output:
                    writer.write(output)
                output_files.append(str(output_file))
        return output_files
    except Exception as e:
        # Best-effort contract: report the problem and return no paths.
        print(f"PDF split error: {e}")
        return []