bluenevus committed on
Commit
9331adf
·
verified ·
1 Parent(s): 87e5c3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -55
app.py CHANGED
@@ -7,72 +7,150 @@ import time
7
  import zipfile
8
  import shutil
9
  from datetime import datetime
 
 
 
 
 
 
 
10
 
11
- def download_and_zip_pdfs(url, progress=gr.Progress()):
12
  """
13
- Download all PDFs from a given URL and create a zip file.
14
- Enhanced with better headers to avoid 403 errors.
15
  """
 
16
 
17
- # Create unique temporary directory for this session
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
19
  temp_dir = f"temp_pdfs_{timestamp}"
20
  zip_filename = f"downloaded_pdfs_{timestamp}.zip"
 
21
 
22
  try:
23
  os.makedirs(temp_dir, exist_ok=True)
24
- progress(0, desc="Initializing...")
25
 
 
26
  if not url.startswith(('http://', 'https://')):
27
  return None, "❌ Error: Please provide a valid URL starting with http:// or https://"
28
 
29
- # Enhanced headers to mimic a real browser
30
- headers = {
31
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
32
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
33
- 'Accept-Language': 'en-US,en;q=0.9',
34
- 'Accept-Encoding': 'gzip, deflate, br',
35
- 'Connection': 'keep-alive',
36
- 'Upgrade-Insecure-Requests': '1',
37
- 'Sec-Fetch-Dest': 'document',
38
- 'Sec-Fetch-Mode': 'navigate',
39
- 'Sec-Fetch-Site': 'none',
40
- 'Sec-Fetch-User': '?1',
41
- 'Cache-Control': 'max-age=0',
42
- }
43
-
44
- # Create a session to maintain cookies
45
- session = requests.Session()
46
- session.headers.update(headers)
47
-
48
- # Fetch webpage with session
49
- progress(0.1, desc="Fetching webpage...")
50
  try:
51
- response = session.get(url, timeout=30, allow_redirects=True)
52
- response.raise_for_status()
53
- except requests.exceptions.RequestException as e:
54
- return None, f"❌ Error fetching webpage: {str(e)}\n\nTip: The website may be blocking automated requests. Try using Solution 2 with Selenium."
55
 
56
- # Parse HTML
57
- progress(0.2, desc="Parsing HTML...")
58
- soup = BeautifulSoup(response.content, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # Find all PDF links
61
- all_links = soup.find_all('a', href=True)
62
- pdf_links = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- for link in all_links:
65
- href = link['href']
66
- if href.lower().endswith('.pdf'):
67
- full_url = urljoin(url, href)
68
- pdf_links.append(full_url)
69
 
70
  if len(pdf_links) == 0:
71
  return None, "⚠️ No PDF files found on this page."
72
 
73
  progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
74
 
75
- # Download PDFs using the same session
 
 
 
 
 
 
 
 
 
 
 
 
76
  successful = 0
77
  failed = 0
78
  status_messages = []
@@ -82,26 +160,30 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
82
  progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
83
  progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
84
 
 
85
  parsed_url = urlparse(pdf_url)
86
  filename = os.path.basename(parsed_url.path)
87
 
88
  if not filename or filename == '.pdf':
89
  filename = f"document_{idx + 1}.pdf"
90
 
 
 
91
  filepath = os.path.join(temp_dir, filename)
92
 
93
- # Download PDF with session
94
  pdf_response = session.get(pdf_url, timeout=60, stream=True)
95
  pdf_response.raise_for_status()
96
 
97
  # Save PDF
98
  with open(filepath, 'wb') as f:
99
  for chunk in pdf_response.iter_content(chunk_size=8192):
100
- f.write(chunk)
 
101
 
102
  successful += 1
103
 
104
- # Polite delay
105
  time.sleep(1)
106
 
107
  except Exception as e:
@@ -110,7 +192,7 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
110
  continue
111
 
112
  if successful == 0:
113
- return None, "❌ Failed to download any PDFs. The website may have strong anti-bot protection."
114
 
115
  # Create zip file
116
  progress(0.9, desc="Creating zip file...")
@@ -123,7 +205,7 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
123
  arcname = os.path.basename(file_path)
124
  zipf.write(file_path, arcname)
125
 
126
- # Clean up
127
  shutil.rmtree(temp_dir)
128
 
129
  # Create status message
@@ -132,6 +214,7 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
132
  status_msg += f"- **Successfully downloaded:** {successful}\n"
133
  status_msg += f"- **Failed:** {failed}\n"
134
  status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n\n"
 
135
 
136
  if failed > 0 and len(status_messages) > 0:
137
  status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
@@ -143,39 +226,48 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
143
  return zip_path, status_msg
144
 
145
  except Exception as e:
 
 
146
  if os.path.exists(temp_dir):
147
  shutil.rmtree(temp_dir)
148
  return None, f"❌ Unexpected error: {str(e)}"
149
 
150
  # Create Gradio interface
151
- with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
152
 
153
  gr.Markdown("""
154
- # 📥 PDF Downloader & Zipper
 
 
155
 
156
- Enter a URL to download all PDF files from that webpage and receive them as a single zip file.
 
 
 
 
157
 
158
  **Instructions:**
159
  1. Paste the URL of the webpage containing PDFs
160
  2. Click "Download & Zip PDFs"
161
- 3. Wait for processing (may take several minutes for many PDFs)
162
  4. Download your zip file!
 
 
163
  """)
164
 
165
  with gr.Row():
166
  url_input = gr.Textbox(
167
  label="Enter Webpage URL",
168
- placeholder="https://www.example.com/documents/",
169
  lines=1,
170
  scale=4
171
  )
172
 
173
  with gr.Row():
174
- submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary", scale=1)
175
  clear_btn = gr.Button("🔄 Clear", scale=1)
176
 
177
  status_output = gr.Markdown(label="Status")
178
-
179
  download_output = gr.File(label="Download Zip File")
180
 
181
  # Examples
@@ -189,7 +281,7 @@ with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
189
 
190
  # Event handlers
191
  submit_btn.click(
192
- fn=download_and_zip_pdfs,
193
  inputs=[url_input],
194
  outputs=[download_output, status_output]
195
  )
@@ -200,6 +292,5 @@ with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
200
  outputs=[download_output, url_input, status_output]
201
  )
202
 
203
- # Launch the app
204
  if __name__ == "__main__":
205
  demo.launch()
 
7
  import zipfile
8
  import shutil
9
  from datetime import datetime
10
+ from selenium import webdriver
11
+ from selenium.webdriver.chrome.service import Service
12
+ from selenium.webdriver.chrome.options import Options
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support.ui import WebDriverWait
15
+ from selenium.webdriver.support import expected_conditions as EC
16
+ from webdriver_manager.chrome import ChromeDriverManager
17
 
18
def setup_selenium_driver():
    """
    Configure and return a headless Chrome Selenium WebDriver.

    The browser is started with flags suited to containerised hosts
    (headless, no sandbox, no /dev/shm usage), a desktop user agent, and
    settings that hide the usual automation markers.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when finished.

    Raises:
        Exception: If ChromeDriver cannot be installed, or Chrome fails to
            start or to accept the post-start configuration. The original
            error is chained as ``__cause__``.
    """
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    )
    hide_webdriver_js = (
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )

    chrome_options = Options()

    # Essential options for cloud environments
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-software-rasterizer')

    # Anti-detection options
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Mimic real browser
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument(f'user-agent={user_agent}')

    # Additional privacy/security settings
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-infobars')
    chrome_options.add_argument('--disable-notifications')

    # Performance optimizations
    # NOTE(review): '--disable-images' may not be a switch Chrome recognises;
    # '--blink-settings=imagesEnabled=false' is the commonly used form - confirm.
    chrome_options.add_argument('--disable-images')  # Faster loading
    chrome_options.page_load_strategy = 'eager'  # Don't wait for all resources

    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": user_agent
        })
        # Register the override for every new document. A plain
        # execute_script() here would only patch the initial blank page and
        # be discarded by the first navigation.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            "source": hide_webdriver_js
        })

        return driver
    except Exception as e:
        # Do not leak a running Chrome process if configuration failed after
        # the browser had already started.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup; the original error is the one to report.
                pass
        raise Exception(f"Failed to initialize Chrome driver: {str(e)}") from e
63
+
64
+ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
65
+ """
66
+ Download all PDFs using Selenium WebDriver and create a zip file.
67
+ """
68
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
69
  temp_dir = f"temp_pdfs_{timestamp}"
70
  zip_filename = f"downloaded_pdfs_{timestamp}.zip"
71
+ driver = None
72
 
73
  try:
74
  os.makedirs(temp_dir, exist_ok=True)
75
+ progress(0, desc="Initializing browser...")
76
 
77
+ # Validate URL
78
  if not url.startswith(('http://', 'https://')):
79
  return None, "❌ Error: Please provide a valid URL starting with http:// or https://"
80
 
81
+ # Initialize Selenium driver
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  try:
83
+ driver = setup_selenium_driver()
84
+ except Exception as e:
85
+ return None, f"❌ Error initializing browser: {str(e)}"
 
86
 
87
+ # Navigate to the page
88
+ progress(0.1, desc="Loading webpage with browser...")
89
+ try:
90
+ driver.get(url)
91
+
92
+ # Wait for page to load (adjust timeout as needed)
93
+ WebDriverWait(driver, 20).until(
94
+ EC.presence_of_element_located((By.TAG_NAME, "body"))
95
+ )
96
+
97
+ # Additional wait for dynamic content
98
+ time.sleep(3)
99
+
100
+ except Exception as e:
101
+ if driver:
102
+ driver.quit()
103
+ return None, f"❌ Error loading webpage: {str(e)}"
104
 
105
  # Find all PDF links
106
+ progress(0.2, desc="Finding PDF links...")
107
+ try:
108
+ # Get page source and parse with BeautifulSoup
109
+ page_source = driver.page_source
110
+ soup = BeautifulSoup(page_source, 'html.parser')
111
+
112
+ # Find all links
113
+ all_links = soup.find_all('a', href=True)
114
+ pdf_links = []
115
+
116
+ for link in all_links:
117
+ href = link['href']
118
+ if href.lower().endswith('.pdf'):
119
+ full_url = urljoin(url, href)
120
+ pdf_links.append(full_url)
121
+
122
+ # Alternative: Use Selenium directly to find PDF links
123
+ if len(pdf_links) == 0:
124
+ pdf_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
125
+ pdf_links = [elem.get_attribute('href') for elem in pdf_elements if elem.get_attribute('href')]
126
+
127
+ except Exception as e:
128
+ if driver:
129
+ driver.quit()
130
+ return None, f"❌ Error finding PDF links: {str(e)}"
131
 
132
+ # Close the browser - we don't need it anymore
133
+ driver.quit()
134
+ driver = None
 
 
135
 
136
  if len(pdf_links) == 0:
137
  return None, "⚠️ No PDF files found on this page."
138
 
139
  progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
140
 
141
+ # Download PDFs using requests with enhanced headers
142
+ headers = {
143
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
144
+ 'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
145
+ 'Accept-Language': 'en-US,en;q=0.9',
146
+ 'Accept-Encoding': 'gzip, deflate, br',
147
+ 'Connection': 'keep-alive',
148
+ 'Referer': url,
149
+ }
150
+
151
+ session = requests.Session()
152
+ session.headers.update(headers)
153
+
154
  successful = 0
155
  failed = 0
156
  status_messages = []
 
160
  progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
161
  progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
162
 
163
+ # Extract filename
164
  parsed_url = urlparse(pdf_url)
165
  filename = os.path.basename(parsed_url.path)
166
 
167
  if not filename or filename == '.pdf':
168
  filename = f"document_{idx + 1}.pdf"
169
 
170
+ # Sanitize filename
171
+ filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
172
  filepath = os.path.join(temp_dir, filename)
173
 
174
+ # Download PDF
175
  pdf_response = session.get(pdf_url, timeout=60, stream=True)
176
  pdf_response.raise_for_status()
177
 
178
  # Save PDF
179
  with open(filepath, 'wb') as f:
180
  for chunk in pdf_response.iter_content(chunk_size=8192):
181
+ if chunk:
182
+ f.write(chunk)
183
 
184
  successful += 1
185
 
186
+ # Be polite - delay between downloads
187
  time.sleep(1)
188
 
189
  except Exception as e:
 
192
  continue
193
 
194
  if successful == 0:
195
+ return None, "❌ Failed to download any PDFs. Please check the URL and try again."
196
 
197
  # Create zip file
198
  progress(0.9, desc="Creating zip file...")
 
205
  arcname = os.path.basename(file_path)
206
  zipf.write(file_path, arcname)
207
 
208
+ # Clean up temporary directory
209
  shutil.rmtree(temp_dir)
210
 
211
  # Create status message
 
214
  status_msg += f"- **Successfully downloaded:** {successful}\n"
215
  status_msg += f"- **Failed:** {failed}\n"
216
  status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n\n"
217
+ status_msg += f"**Method:** Selenium WebDriver (bypasses most anti-bot protection)\n"
218
 
219
  if failed > 0 and len(status_messages) > 0:
220
  status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
 
226
  return zip_path, status_msg
227
 
228
  except Exception as e:
229
+ if driver:
230
+ driver.quit()
231
  if os.path.exists(temp_dir):
232
  shutil.rmtree(temp_dir)
233
  return None, f"❌ Unexpected error: {str(e)}"
234
 
235
  # Create Gradio interface
236
+ with gr.Blocks(title="PDF Downloader & Zipper (Selenium)", theme=gr.themes.Soft()) as demo:
237
 
238
  gr.Markdown("""
239
+ # 📥 PDF Downloader & Zipper (Selenium WebDriver)
240
+
241
+ This version uses **Selenium WebDriver** to bypass anti-bot protection and access websites that block cloud platforms.
242
 
243
+ **How it works:**
244
+ - Uses a real Chrome browser (headless) to load the page
245
+ - Mimics human browsing behavior
246
+ - Bypasses most IP-based blocking and anti-bot measures
247
+ - Works with government sites and other restricted sources
248
 
249
  **Instructions:**
250
  1. Paste the URL of the webpage containing PDFs
251
  2. Click "Download & Zip PDFs"
252
+ 3. Wait for processing (first run may take longer to download Chrome driver)
253
  4. Download your zip file!
254
+
255
+ ⚠️ **Note:** First run will download ChromeDriver (~10MB) - this is normal and only happens once.
256
  """)
257
 
258
  with gr.Row():
259
  url_input = gr.Textbox(
260
  label="Enter Webpage URL",
261
+ placeholder="https://www.esd.whs.mil/Directives/issuances/dodi/",
262
  lines=1,
263
  scale=4
264
  )
265
 
266
  with gr.Row():
267
+ submit_btn = gr.Button("📥 Download & Zip PDFs (Selenium)", variant="primary", scale=1)
268
  clear_btn = gr.Button("🔄 Clear", scale=1)
269
 
270
  status_output = gr.Markdown(label="Status")
 
271
  download_output = gr.File(label="Download Zip File")
272
 
273
  # Examples
 
281
 
282
  # Event handlers
283
  submit_btn.click(
284
+ fn=download_and_zip_pdfs_selenium,
285
  inputs=[url_input],
286
  outputs=[download_output, status_output]
287
  )
 
292
  outputs=[download_output, url_input, status_output]
293
  )
294
 
 
295
  if __name__ == "__main__":
296
  demo.launch()