bluenevus committed on
Commit d23df15 · verified · 1 Parent(s): 3115e99

Update app.py

Files changed (1):
  1. app.py +199 -216

app.py CHANGED
@@ -1,253 +1,236 @@
- import gradio as gr
  import requests
  from bs4 import BeautifulSoup
  import os
  from urllib.parse import urljoin, urlparse
  import time
  import zipfile
  import shutil
- from datetime import datetime
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
-
- def setup_selenium_driver():
-     """Configure Selenium WebDriver for Hugging Face Spaces."""
-     chrome_options = Options()
-     chrome_options.add_argument('--headless')
-     chrome_options.add_argument('--no-sandbox')
-     chrome_options.add_argument('--disable-dev-shm-usage')
-     chrome_options.add_argument('--disable-gpu')
-     chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-     chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-     chrome_options.add_experimental_option('useAutomationExtension', False)
-     chrome_options.add_argument('--window-size=1920,1080')
-     chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')
-     chrome_options.binary_location = '/usr/bin/chromium'
-
-     try:
-         service = Service('/usr/bin/chromedriver')
-         driver = webdriver.Chrome(service=service, options=chrome_options)
-         driver.execute_cdp_cmd('Network.setUserAgentOverride', {
-             "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
-         })
-         driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
-         return driver
-     except Exception as e:
-         raise Exception(f"Failed to initialize Chrome driver: {str(e)}")

- def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
      """
-     Download all PDFs using Selenium WebDriver with proper session management.
      """
-     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-     temp_dir = f"temp_pdfs_{timestamp}"
-     zip_filename = f"downloaded_pdfs_{timestamp}.zip"
-     driver = None

      try:
-         os.makedirs(temp_dir, exist_ok=True)
-         progress(0, desc="Initializing browser...")
-
-         if not url.startswith(('http://', 'https://')):
-             return None, "❌ Error: Please provide a valid URL starting with http:// or https://"
-
-         # Initialize Selenium driver
-         try:
-             driver = setup_selenium_driver()
-         except Exception as e:
-             return None, f"❌ Error initializing browser: {str(e)}"
-
-         # Navigate to the page
-         progress(0.1, desc="Loading webpage with browser...")
-         try:
-             driver.get(url)
-             WebDriverWait(driver, 20).until(
-                 EC.presence_of_element_located((By.TAG_NAME, "body"))
-             )
-             time.sleep(3)
-         except Exception as e:
-             if driver:
-                 driver.quit()
-             return None, f"❌ Error loading webpage: {str(e)}"
-
-         # Find all PDF links
-         progress(0.2, desc="Finding PDF links...")
-         try:
-             page_source = driver.page_source
-             soup = BeautifulSoup(page_source, 'html.parser')
-
-             all_links = soup.find_all('a', href=True)
-             pdf_links = []
-
-             for link in all_links:
-                 href = link['href']
-                 if href.lower().endswith('.pdf'):
-                     full_url = urljoin(url, href)
-                     pdf_links.append(full_url)
-
-             if len(pdf_links) == 0:
-                 pdf_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
-                 pdf_links = [elem.get_attribute('href') for elem in pdf_elements if elem.get_attribute('href')]
-
-         except Exception as e:
-             if driver:
-                 driver.quit()
-             return None, f"❌ Error finding PDF links: {str(e)}"

          if len(pdf_links) == 0:
-             if driver:
-                 driver.quit()
-             return None, "⚠️ No PDF files found on this page."
-
-         # **KEY FIX: Transfer Selenium cookies to requests session**
-         progress(0.25, desc="Transferring session cookies...")
-         session = requests.Session()
-
-         # Copy all cookies from Selenium to requests
-         for cookie in driver.get_cookies():
-             session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
-
-         # Copy headers from Selenium
-         headers = {
-             'User-Agent': driver.execute_script("return navigator.userAgent;"),
-             'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.9',
-             'Accept-Encoding': 'gzip, deflate, br',
-             'Connection': 'keep-alive',
-             'Referer': url,
-         }
-         session.headers.update(headers)
-
-         # Now we can close the browser
-         driver.quit()
-         driver = None
-
-         progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
-
-         # Download PDFs with the authenticated session
          successful = 0
          failed = 0
-         status_messages = []

-         for idx, pdf_url in enumerate(pdf_links):
              try:
-                 progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
-                 progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
-
-                 # Extract and sanitize filename
                  parsed_url = urlparse(pdf_url)
-                 filename = os.path.basename(parsed_url.path)

-                 if not filename or filename == '.pdf':
-                     filename = f"document_{idx + 1}.pdf"

-                 # Remove invalid characters
-                 filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
-                 if not filename.endswith('.pdf'):
-                     filename += '.pdf'

-                 filepath = os.path.join(temp_dir, filename)

-                 # Download with retry logic
-                 max_retries = 3
-                 for attempt in range(max_retries):
-                     try:
-                         pdf_response = session.get(pdf_url, timeout=60, stream=True)
-                         pdf_response.raise_for_status()
-
-                         # Verify it's actually a PDF
-                         content_type = pdf_response.headers.get('content-type', '').lower()
-                         if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
-                             raise Exception(f"Not a PDF file (content-type: {content_type})")
-
-                         # Save PDF
-                         with open(filepath, 'wb') as f:
-                             for chunk in pdf_response.iter_content(chunk_size=8192):
-                                 if chunk:
-                                     f.write(chunk)
-
-                         # Verify file was written and has content
-                         if os.path.getsize(filepath) == 0:
-                             raise Exception("Downloaded file is empty")
-
-                         successful += 1
-                         break  # Success, exit retry loop
-
-                     except Exception as e:
-                         if attempt == max_retries - 1:
-                             raise  # Last attempt failed
-                         time.sleep(2)  # Wait before retry

-                 # Be polite - delay between downloads
-                 time.sleep(1)

              except Exception as e:
                  failed += 1
-                 status_messages.append(f"Failed: {filename} - {str(e)}")
-                 # Clean up failed download
-                 if os.path.exists(filepath):
-                     os.remove(filepath)
                  continue

-         if successful == 0:
-             return None, f"❌ Failed to download any PDFs.\n\n**Possible reasons:**\n- PDFs require authentication\n- Links are not direct PDF URLs\n- Website blocking automated downloads\n\n**Failed attempts:** {len(status_messages)}\n**Sample errors:**\n" + "\n".join(status_messages[:3])
-
-         # Create zip file
-         progress(0.9, desc="Creating zip file...")
-         zip_path = os.path.join(os.getcwd(), zip_filename)
-
-         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-             for root, dirs, files in os.walk(temp_dir):
-                 for file in files:
-                     file_path = os.path.join(root, file)
-                     zipf.write(file_path, os.path.basename(file_path))
-
-         # Clean up
-         shutil.rmtree(temp_dir)
-
-         # Create status message
-         status_msg = f"✅ **Download Complete!**\n\n"
-         status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
-         status_msg += f"- **Successfully downloaded:** {successful}\n"
-         status_msg += f"- **Failed:** {failed}\n"
-         status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n"
-
-         if failed > 0 and len(status_messages) > 0:
-             status_msg += "\n**Failed Downloads (first 10):**\n" + "\n".join(status_messages[:10])
-             if len(status_messages) > 10:
-                 status_msg += f"\n... and {len(status_messages) - 10} more"

-         progress(1.0, desc="Complete!")
-         return zip_path, status_msg

      except Exception as e:
-         if driver:
-             driver.quit()
-         if os.path.exists(temp_dir):
-             shutil.rmtree(temp_dir)
          return None, f"❌ Unexpected error: {str(e)}"

- with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("""
-     # 📥 PDF Downloader & Zipper (Selenium)
-
-     Downloads all PDFs from a webpage using Selenium WebDriver.
-     """)
-
-     url_input = gr.Textbox(label="Enter Webpage URL", placeholder="https://example.com/pdfs/")
-     submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary")
-     status_output = gr.Markdown()
-     download_output = gr.File(label="Download Zip File")

-     submit_btn.click(
-         fn=download_and_zip_pdfs_selenium,
-         inputs=[url_input],
-         outputs=[download_output, status_output]
-     )

  if __name__ == "__main__":
-     demo.launch()
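The removed implementation above drove headless Chromium through Selenium; its key move was handing the browser's cookies and User-Agent to a requests.Session so that PDF links served behind that session kept working after the browser closed. For reference, that pattern distills to a few lines; a minimal sketch, assuming a local Chromium/chromedriver install, with a purely hypothetical target URL:

```python
# Sketch of the cookie hand-off the removed code performed: establish a
# session in a real browser, then reuse it from requests for the downloads.
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # assumes chromedriver on PATH
driver.get("https://example.com/pdfs/")     # hypothetical URL

session = requests.Session()
for cookie in driver.get_cookies():
    # requests only needs name/value/domain from each Selenium cookie
    session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
session.headers['User-Agent'] = driver.execute_script("return navigator.userAgent;")
driver.quit()

# session.get(pdf_url) now presents the cookies the browser established.
```

The rewritten file, which drops the browser entirely in favor of plain requests, follows.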
  import requests
  from bs4 import BeautifulSoup
  import os
  from urllib.parse import urljoin, urlparse
  import time
  import zipfile
+ import io
+ import gradio as gr
+ from pathlib import Path
+ import tempfile
  import shutil

+ def download_pdfs_from_page(url, output_option="zip", output_dir='downloaded_pdfs', progress=gr.Progress()):
      """
+     Download all PDFs from a webpage.
+
+     Args:
+         url: The webpage URL to scrape
+         output_option: Either "zip" or "directory"
+         output_dir: Directory to save downloaded PDFs (only used if output_option is "directory")
+         progress: Gradio progress tracker
+
+     Returns:
+         For zip: tuple of (zip_file_path, summary_message)
+         For directory: tuple of (None, summary_message)
      """
+
+     # Set headers to mimic a browser request
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+     }

      try:
+         # Fetch the webpage
+         progress(0, desc="Fetching webpage...")
+         response = requests.get(url, headers=headers, timeout=30)
+         response.raise_for_status()
+
+         # Parse HTML
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Find all links
+         all_links = soup.find_all('a', href=True)
+
+         # Filter for PDF links (including those with query parameters)
+         pdf_links = []
+         for link in all_links:
+             href = link['href']
+             if '.pdf' in href.lower():
+                 full_url = urljoin(url, href)
+                 pdf_links.append(full_url)

          if len(pdf_links) == 0:
+             return None, "❌ No PDF links found on the page."
+
+         progress(0.1, desc=f"Found {len(pdf_links)} PDF links")
+
+         # Create temporary directory for downloads
+         temp_dir = tempfile.mkdtemp()
+
+         # Download each PDF
          successful = 0
          failed = 0
+         failed_urls = []

+         for idx, pdf_url in enumerate(pdf_links, 1):
              try:
+                 # Extract filename from URL (remove query parameters)
                  parsed_url = urlparse(pdf_url)
+                 path_without_query = parsed_url.path
+                 filename = os.path.basename(path_without_query)

+                 # Create full file path in temp directory
+                 filepath = os.path.join(temp_dir, filename)

+                 # Update progress
+                 progress((0.1 + (0.8 * idx / len(pdf_links))),
+                          desc=f"Downloading {idx}/{len(pdf_links)}: {filename}")

+                 # Download PDF
+                 pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
+                 pdf_response.raise_for_status()
+
+                 # Save PDF
+                 with open(filepath, 'wb') as f:
+                     f.write(pdf_response.content)

+                 successful += 1

+                 # Be polite - add a small delay between downloads
+                 time.sleep(0.5)

              except Exception as e:
                  failed += 1
+                 failed_urls.append(f"{filename}: {str(e)}")
                  continue

+         # Generate summary message
+         summary = f"""
+ ✅ **Download Complete!**
+
+ 📊 **Summary:**
+ - Total PDFs found: {len(pdf_links)}
+ - Successfully downloaded: {successful}
+ - Failed: {failed}
+ """
+
+         if failed > 0:
+             summary += f"\n\n⚠️ **Failed Downloads:**\n"
+             for fail in failed_urls[:10]:  # Show first 10 failures
+                 summary += f"- {fail}\n"
+             if len(failed_urls) > 10:
+                 summary += f"- ... and {len(failed_urls) - 10} more\n"
+
+         # Handle output based on user choice
+         if output_option == "zip":
+             progress(0.9, desc="Creating zip file...")
+
+             # Create zip file
+             zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
+             with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                 for root, dirs, files in os.walk(temp_dir):
+                     for file in files:
+                         file_path = os.path.join(root, file)
+                         zipf.write(file_path, arcname=file)
+
+             # Clean up temp directory
+             shutil.rmtree(temp_dir)
+
+             progress(1.0, desc="Complete!")
+             return zip_path, summary

+         else:  # output_option == "directory"
+             progress(0.9, desc="Copying files to output directory...")
+
+             # Create output directory if it doesn't exist
+             if not os.path.exists(output_dir):
+                 os.makedirs(output_dir)
+
+             # Copy files from temp to output directory
+             for file in os.listdir(temp_dir):
+                 shutil.copy2(os.path.join(temp_dir, file), os.path.join(output_dir, file))
+
+             # Clean up temp directory
+             shutil.rmtree(temp_dir)
+
+             summary += f"\n\n📁 **Files saved to:** `{os.path.abspath(output_dir)}`"
+
+             progress(1.0, desc="Complete!")
+             return None, summary

+     except requests.exceptions.RequestException as e:
+         return None, f"❌ Error fetching webpage: {str(e)}"
      except Exception as e:
          return None, f"❌ Unexpected error: {str(e)}"

+ # Create Gradio interface
+ def create_interface():
+     with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
+         gr.Markdown(
+             """
+             # 📥 PDF Downloader
+             Download all PDFs from any webpage with ease!
+
+             **Instructions:**
+             1. Enter the URL of the webpage containing PDF links
+             2. Choose whether to download as a ZIP file or save to a directory
+             3. If saving to directory, specify the directory name
+             4. Click "Download PDFs"
+             """
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 url_input = gr.Textbox(
+                     label="Webpage URL",
+                     placeholder="https://example.com/pdfs",
+                     lines=1
+                 )
+
+                 output_option = gr.Radio(
+                     choices=["zip", "directory"],
+                     value="zip",
+                     label="Output Option",
+                     info="Choose how to receive the downloaded PDFs"
+                 )
+
+                 output_dir = gr.Textbox(
+                     label="Output Directory (only for 'directory' option)",
+                     placeholder="downloaded_pdfs",
+                     value="downloaded_pdfs",
+                     lines=1,
+                     visible=False
+                 )
+
+                 download_btn = gr.Button("📥 Download PDFs", variant="primary", size="lg")
+
+         with gr.Row():
+             with gr.Column():
+                 output_file = gr.File(label="Download ZIP", visible=True)
+                 summary_output = gr.Markdown(label="Summary")
+
+         # Show/hide directory input based on output option
+         def toggle_directory_input(option):
+             return gr.update(visible=(option == "directory"))
+
+         output_option.change(
+             fn=toggle_directory_input,
+             inputs=[output_option],
+             outputs=[output_dir]
+         )
+
+         # Handle download button click
+         download_btn.click(
+             fn=download_pdfs_from_page,
+             inputs=[url_input, output_option, output_dir],
+             outputs=[output_file, summary_output]
+         )
+
+         gr.Markdown(
+             """
+             ---
+             ### 💡 Tips:
+             - The script will find all PDF links on the page, including those with query parameters
+             - Downloads include a small delay between requests to be respectful to servers
+             - ZIP files are automatically named with a timestamp
+             - For directory output, files are saved to your local filesystem
+             """
+         )

+     return demo

+ # Launch the interface
  if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(share=True)
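One caveat against the new filename logic: os.path.basename(parsed_url.path) returns an empty string when a PDF URL's path ends in a slash, and unlike the removed version, the rewrite no longer falls back to a generated document_N.pdf name, so such downloads could collide on an empty filename in temp_dir. A hedged guard, sketched under the assumption that an index-based fallback is acceptable (pdf_filename is an illustrative helper, not part of the commit):

```python
# Sketch: derive a safe filename from a PDF URL, with an assumed fallback.
import os
from urllib.parse import urlparse

def pdf_filename(pdf_url: str, idx: int) -> str:
    path = urlparse(pdf_url).path  # .path drops both ?query and #fragment
    name = os.path.basename(path)
    if not name or not name.lower().endswith('.pdf'):
        name = f"document_{idx}.pdf"  # assumed fallback; not in the commit
    return name

print(pdf_filename("https://example.com/docs/report.pdf?v=2", 1))  # report.pdf
print(pdf_filename("https://example.com/docs/", 2))                # document_2.pdf
```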