Maheen001 committed on
Commit
d411b46
·
verified ·
1 Parent(s): 791ca54

Create utils/pdf_utils.py

Browse files
Files changed (1) hide show
  1. utils/pdf_utils.py +96 -0
utils/pdf_utils.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from pathlib import Path
3
+ from typing import List, Dict
4
+
5
+
6
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF file.

    The text of consecutive pages is separated by a blank line, and
    leading/trailing whitespace of the combined result is removed.

    Args:
        file_path: Path to PDF file

    Returns:
        Extracted text, or an empty string if the file could not be
        opened or parsed (the error is printed, not raised)
    """
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Pages must be read while the file is still open. A page that
            # yields no text object is treated as empty so that it cannot
            # abort the whole extraction with a TypeError.
            page_texts = [
                page.extract_text() or '' for page in pdf_reader.pages
            ]
    except Exception as e:
        # Best-effort contract: callers receive "" instead of an exception.
        print(f"PDF extraction error: {e}")
        return ""

    # A single join avoids repeated string reallocation; strip() makes the
    # result identical to appending '\n\n' after every page and stripping.
    return '\n\n'.join(page_texts).strip()
28
+
29
+
30
def get_pdf_metadata(file_path: str) -> Dict:
    """
    Read the page count and basic document information of a PDF.

    Args:
        file_path: Path to PDF file

    Returns:
        Dictionary with the keys 'num_pages', 'author', 'title',
        'subject' and 'creator'; fields missing from the document are
        empty strings. If anything goes wrong, the error is printed and
        a dictionary holding only an 'error' message is returned.
    """
    # Result field name -> key in the PDF document information dictionary.
    info_keys = (
        ('author', '/Author'),
        ('title', '/Title'),
        ('subject', '/Subject'),
        ('creator', '/Creator'),
    )

    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)

            result = {'num_pages': len(reader.pages)}
            doc_info = reader.metadata

            for field_name, pdf_key in info_keys:
                if doc_info:
                    result[field_name] = doc_info.get(pdf_key, '')
                else:
                    # No (or empty) document information: default to blank.
                    result[field_name] = ''

            return result
    except Exception as e:
        print(f"Metadata extraction error: {e}")
        return {'error': str(e)}
62
+
63
+
64
def split_pdf_pages(file_path: str, output_dir: str) -> List[str]:
    """
    Write each page of a PDF to its own single-page PDF file.

    The output directory is created if needed, and the files are named
    'page_1.pdf', 'page_2.pdf', ... in page order.

    Args:
        file_path: Path to PDF file
        output_dir: Output directory for page files

    Returns:
        List of output file paths, or an empty list if any step failed
        (the error is printed, not raised)
    """
    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)

        written = []
        with open(file_path, 'rb') as source:
            reader = PyPDF2.PdfReader(source)

            for page_number, page in enumerate(reader.pages, start=1):
                # One fresh writer per page so each file holds one page.
                single_page = PyPDF2.PdfWriter()
                single_page.add_page(page)

                destination = target_dir / f"page_{page_number}.pdf"
                with open(destination, 'wb') as sink:
                    single_page.write(sink)

                written.append(str(destination))

        return written
    except Exception as e:
        print(f"PDF split error: {e}")
        return []