bluenevus committed
Commit 3d64e96 · verified · 1 Parent(s): fabcdae

Create app.py

Files changed (1)
  1. app.py +202 -0
app.py ADDED
@@ -0,0 +1,202 @@
import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import zipfile
import shutil
from datetime import datetime

def download_and_zip_pdfs(url, progress=gr.Progress()):
    """
    Download all PDFs from a given URL and create a zip file.

    Args:
        url: The webpage URL to scrape
        progress: Gradio progress tracker

    Returns:
        tuple: (zip_file_path, status_message)
    """

    # Create unique temporary directory for this session
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_dir = f"temp_pdfs_{timestamp}"
    zip_filename = f"downloaded_pdfs_{timestamp}.zip"

    try:
        # Create temporary directory
        os.makedirs(temp_dir, exist_ok=True)
        progress(0, desc="Initializing...")

        # Validate URL
        if not url.startswith(('http://', 'https://')):
            return None, "❌ Error: Please provide a valid URL starting with http:// or https://"

        # Set headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        # Fetch webpage
        progress(0.1, desc="Fetching webpage...")
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return None, f"❌ Error fetching webpage: {str(e)}"

        # Parse HTML
        progress(0.2, desc="Parsing HTML...")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all PDF links
        all_links = soup.find_all('a', href=True)
        pdf_links = []

        for link in all_links:
            href = link['href']
            if href.lower().endswith('.pdf'):
                full_url = urljoin(url, href)
                pdf_links.append(full_url)

        if len(pdf_links) == 0:
            return None, "⚠️ No PDF files found on this page."

        progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")

        # Download PDFs
        successful = 0
        failed = 0
        status_messages = []

        for idx, pdf_url in enumerate(pdf_links):
            try:
                # Update progress
                progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
                progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")

                # Extract filename
                parsed_url = urlparse(pdf_url)
                filename = os.path.basename(parsed_url.path)

                # Handle empty or invalid filenames
                if not filename or filename == '.pdf':
                    filename = f"document_{idx + 1}.pdf"

                filepath = os.path.join(temp_dir, filename)

                # Avoid silently overwriting PDFs that share a basename
                if os.path.exists(filepath):
                    filename = f"{idx + 1}_{filename}"
                    filepath = os.path.join(temp_dir, filename)

                # Download PDF
                pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                pdf_response.raise_for_status()

                # Save PDF
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                successful += 1

                # Small delay to be polite
                time.sleep(0.5)

            except Exception as e:
                failed += 1
                # Report pdf_url here: filename may not be bound yet if the
                # failure happened before it was extracted.
                status_messages.append(f"Failed: {pdf_url} - {str(e)}")
                continue

        if successful == 0:
            return None, "❌ Failed to download any PDFs. Please check the URL and try again."

        # Create zip file
        progress(0.9, desc="Creating zip file...")
        zip_path = os.path.join(os.getcwd(), zip_filename)

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.basename(file_path)
                    zipf.write(file_path, arcname)

        # Clean up temporary directory
        shutil.rmtree(temp_dir)

        # Create status message
        status_msg = "✅ **Download Complete!**\n\n"
        status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
        status_msg += f"- **Successfully downloaded:** {successful}\n"
        status_msg += f"- **Failed:** {failed}\n"
        status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n\n"

        if failed > 0 and len(status_messages) > 0:
            status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
            if len(status_messages) > 10:
                status_msg += f"\n... and {len(status_messages) - 10} more"

        progress(1.0, desc="Complete!")

        return zip_path, status_msg

    except Exception as e:
        # Clean up on error
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        return None, f"❌ Unexpected error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 📥 PDF Downloader & Zipper

    Enter a URL to download all PDF files from that webpage and receive them as a single zip file.

    **Instructions:**
    1. Paste the URL of the webpage containing PDFs
    2. Click "Download & Zip PDFs"
    3. Wait for processing (may take several minutes for many PDFs)
    4. Download your zip file!
    """)

    with gr.Row():
        url_input = gr.Textbox(
            label="Enter Webpage URL",
            placeholder="https://www.example.com/documents/",
            lines=1,
            scale=4
        )

    with gr.Row():
        submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary", scale=1)
        clear_btn = gr.Button("🔄 Clear", scale=1)

    status_output = gr.Markdown(label="Status")

    download_output = gr.File(label="Download Zip File")

    # Examples
    gr.Markdown("### Example URLs:")
    gr.Examples(
        examples=[
            ["https://www.esd.whs.mil/Directives/issuances/dodi/"],
        ],
        inputs=url_input
    )

    # Event handlers
    submit_btn.click(
        fn=download_and_zip_pdfs,
        inputs=[url_input],
        outputs=[download_output, status_output]
    )

    clear_btn.click(
        fn=lambda: (None, None, ""),
        inputs=None,
        outputs=[download_output, url_input, status_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
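
As a side note, here is a minimal sketch for exercising the download logic outside the Gradio UI. It assumes app.py is importable from the working directory and its dependencies (gradio, requests, beautifulsoup4) are installed; the no-op lambda is a stand-in for gr.Progress so the function can run without an active Gradio event, and the URL is just the example already used in the app.

# Hypothetical smoke test for download_and_zip_pdfs, run as a plain script.
# The lambda replaces gr.Progress so no Gradio event context is required.
from app import download_and_zip_pdfs

zip_path, status = download_and_zip_pdfs(
    "https://www.esd.whs.mil/Directives/issuances/dodi/",  # example URL from the app
    progress=lambda *args, **kwargs: None,  # no-op progress callback
)
print(status)
print("Zip written to:", zip_path)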