bluenevus committed on
Commit
ea2d648
·
verified ·
1 Parent(s): 274f96e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -67
app.py CHANGED
@@ -8,63 +8,40 @@ import zipfile
8
  import shutil
9
  from datetime import datetime
10
  from selenium import webdriver
11
- from selenium.webdriver.chrome.service import Service
12
  from selenium.webdriver.chrome.options import Options
 
13
  from selenium.webdriver.common.by import By
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
 
17
  def setup_selenium_driver():
18
- """
19
- Configure Selenium WebDriver for Hugging Face Spaces.
20
- Uses system-installed Chromium instead of webdriver-manager.
21
- """
22
  chrome_options = Options()
23
-
24
- # Essential options for cloud environments
25
  chrome_options.add_argument('--headless')
26
  chrome_options.add_argument('--no-sandbox')
27
  chrome_options.add_argument('--disable-dev-shm-usage')
28
  chrome_options.add_argument('--disable-gpu')
29
- chrome_options.add_argument('--disable-software-rasterizer')
30
-
31
- # Anti-detection options
32
  chrome_options.add_argument('--disable-blink-features=AutomationControlled')
33
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
34
  chrome_options.add_experimental_option('useAutomationExtension', False)
35
-
36
- # Mimic real browser
37
  chrome_options.add_argument('--window-size=1920,1080')
38
  chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')
39
-
40
- # Performance optimizations
41
- chrome_options.add_argument('--disable-extensions')
42
- chrome_options.add_argument('--disable-infobars')
43
- chrome_options.add_argument('--disable-notifications')
44
- chrome_options.add_argument('--disable-images')
45
- chrome_options.page_load_strategy = 'eager'
46
-
47
- # Specify binary locations for Hugging Face
48
  chrome_options.binary_location = '/usr/bin/chromium'
49
 
50
  try:
51
- # Use system ChromeDriver path
52
  service = Service('/usr/bin/chromedriver')
53
  driver = webdriver.Chrome(service=service, options=chrome_options)
54
-
55
- # Additional anti-detection JavaScript
56
  driver.execute_cdp_cmd('Network.setUserAgentOverride', {
57
  "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
58
  })
59
  driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
60
-
61
  return driver
62
  except Exception as e:
63
  raise Exception(f"Failed to initialize Chrome driver: {str(e)}")
64
 
65
  def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
66
  """
67
- Download all PDFs using Selenium WebDriver and create a zip file.
68
  """
69
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
70
  temp_dir = f"temp_pdfs_{timestamp}"
@@ -82,7 +59,7 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
82
  try:
83
  driver = setup_selenium_driver()
84
  except Exception as e:
85
- return None, f"❌ Error initializing browser: {str(e)}\n\n**Troubleshooting:**\n- Ensure Chrome/Chromium is installed\n- Check if ChromeDriver is accessible\n- Verify Hugging Face Space has necessary packages"
86
 
87
  # Navigate to the page
88
  progress(0.1, desc="Loading webpage with browser...")
@@ -92,7 +69,6 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
92
  EC.presence_of_element_located((By.TAG_NAME, "body"))
93
  )
94
  time.sleep(3)
95
-
96
  except Exception as e:
97
  if driver:
98
  driver.quit()
@@ -122,25 +98,37 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
122
  driver.quit()
123
  return None, f"❌ Error finding PDF links: {str(e)}"
124
 
125
- driver.quit()
126
- driver = None
127
-
128
  if len(pdf_links) == 0:
 
 
129
  return None, "⚠️ No PDF files found on this page."
130
 
131
- progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
 
 
132
 
133
- # Download PDFs
 
 
 
 
134
  headers = {
135
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
136
  'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
137
  'Accept-Language': 'en-US,en;q=0.9',
 
 
138
  'Referer': url,
139
  }
140
-
141
- session = requests.Session()
142
  session.headers.update(headers)
143
 
 
 
 
 
 
 
 
144
  successful = 0
145
  failed = 0
146
  status_messages = []
@@ -150,33 +138,63 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
150
  progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
151
  progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
152
 
 
153
  parsed_url = urlparse(pdf_url)
154
  filename = os.path.basename(parsed_url.path)
155
 
156
  if not filename or filename == '.pdf':
157
  filename = f"document_{idx + 1}.pdf"
158
 
 
159
  filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
160
- filepath = os.path.join(temp_dir, filename)
 
161
 
162
- pdf_response = session.get(pdf_url, timeout=60, stream=True)
163
- pdf_response.raise_for_status()
164
 
165
- with open(filepath, 'wb') as f:
166
- for chunk in pdf_response.iter_content(chunk_size=8192):
167
- if chunk:
168
- f.write(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
- successful += 1
171
  time.sleep(1)
172
 
173
  except Exception as e:
174
  failed += 1
175
  status_messages.append(f"Failed: {filename} - {str(e)}")
 
 
 
176
  continue
177
 
178
  if successful == 0:
179
- return None, "❌ Failed to download any PDFs."
180
 
181
  # Create zip file
182
  progress(0.9, desc="Creating zip file...")
@@ -188,8 +206,10 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
188
  file_path = os.path.join(root, file)
189
  zipf.write(file_path, os.path.basename(file_path))
190
 
 
191
  shutil.rmtree(temp_dir)
192
 
 
193
  status_msg = f"✅ **Download Complete!**\n\n"
194
  status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
195
  status_msg += f"- **Successfully downloaded:** {successful}\n"
@@ -197,7 +217,9 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
197
  status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n"
198
 
199
  if failed > 0 and len(status_messages) > 0:
200
- status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
 
 
201
 
202
  progress(1.0, desc="Complete!")
203
  return zip_path, status_msg
@@ -209,24 +231,4 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
209
  shutil.rmtree(temp_dir)
210
  return None, f"❌ Unexpected error: {str(e)}"
211
 
212
- # Gradio interface (same as before)
213
- with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
214
- gr.Markdown("""
215
- # 📥 PDF Downloader & Zipper (Selenium)
216
-
217
- Downloads all PDFs from a webpage using Selenium WebDriver.
218
- """)
219
-
220
- url_input = gr.Textbox(label="Enter Webpage URL", placeholder="https://example.com/pdfs/")
221
- submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary")
222
- status_output = gr.Markdown()
223
- download_output = gr.File(label="Download Zip File")
224
-
225
- submit_btn.click(
226
- fn=download_and_zip_pdfs_selenium,
227
- inputs=[url_input],
228
- outputs=[download_output, status_output]
229
- )
230
-
231
- if __name__ == "__main__":
232
- demo.launch()
 
8
  import shutil
9
  from datetime import datetime
10
  from selenium import webdriver
 
11
  from selenium.webdriver.chrome.options import Options
12
+ from selenium.webdriver.chrome.service import Service
13
  from selenium.webdriver.common.by import By
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
 
17
  def setup_selenium_driver():
18
+ """Configure Selenium WebDriver for Hugging Face Spaces."""
 
 
 
19
  chrome_options = Options()
 
 
20
  chrome_options.add_argument('--headless')
21
  chrome_options.add_argument('--no-sandbox')
22
  chrome_options.add_argument('--disable-dev-shm-usage')
23
  chrome_options.add_argument('--disable-gpu')
 
 
 
24
  chrome_options.add_argument('--disable-blink-features=AutomationControlled')
25
  chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
26
  chrome_options.add_experimental_option('useAutomationExtension', False)
 
 
27
  chrome_options.add_argument('--window-size=1920,1080')
28
  chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')
 
 
 
 
 
 
 
 
 
29
  chrome_options.binary_location = '/usr/bin/chromium'
30
 
31
  try:
 
32
  service = Service('/usr/bin/chromedriver')
33
  driver = webdriver.Chrome(service=service, options=chrome_options)
 
 
34
  driver.execute_cdp_cmd('Network.setUserAgentOverride', {
35
  "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
36
  })
37
  driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
 
38
  return driver
39
  except Exception as e:
40
  raise Exception(f"Failed to initialize Chrome driver: {str(e)}")
41
 
42
  def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
43
  """
44
+ Download all PDFs using Selenium WebDriver with proper session management.
45
  """
46
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
47
  temp_dir = f"temp_pdfs_{timestamp}"
 
59
  try:
60
  driver = setup_selenium_driver()
61
  except Exception as e:
62
+ return None, f"❌ Error initializing browser: {str(e)}"
63
 
64
  # Navigate to the page
65
  progress(0.1, desc="Loading webpage with browser...")
 
69
  EC.presence_of_element_located((By.TAG_NAME, "body"))
70
  )
71
  time.sleep(3)
 
72
  except Exception as e:
73
  if driver:
74
  driver.quit()
 
98
  driver.quit()
99
  return None, f"❌ Error finding PDF links: {str(e)}"
100
 
 
 
 
101
  if len(pdf_links) == 0:
102
+ if driver:
103
+ driver.quit()
104
  return None, "⚠️ No PDF files found on this page."
105
 
106
+ # **KEY FIX: Transfer Selenium cookies to requests session**
107
+ progress(0.25, desc="Transferring session cookies...")
108
+ session = requests.Session()
109
 
110
+ # Copy all cookies from Selenium to requests
111
+ for cookie in driver.get_cookies():
112
+ session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
113
+
114
+ # Copy headers from Selenium
115
  headers = {
116
+ 'User-Agent': driver.execute_script("return navigator.userAgent;"),
117
  'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
118
  'Accept-Language': 'en-US,en;q=0.9',
119
+ 'Accept-Encoding': 'gzip, deflate, br',
120
+ 'Connection': 'keep-alive',
121
  'Referer': url,
122
  }
 
 
123
  session.headers.update(headers)
124
 
125
+ # Now we can close the browser
126
+ driver.quit()
127
+ driver = None
128
+
129
+ progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
130
+
131
+ # Download PDFs with the authenticated session
132
  successful = 0
133
  failed = 0
134
  status_messages = []
 
138
  progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
139
  progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
140
 
141
+ # Extract and sanitize filename
142
  parsed_url = urlparse(pdf_url)
143
  filename = os.path.basename(parsed_url.path)
144
 
145
  if not filename or filename == '.pdf':
146
  filename = f"document_{idx + 1}.pdf"
147
 
148
+ # Remove invalid characters
149
  filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
150
+ if not filename.endswith('.pdf'):
151
+ filename += '.pdf'
152
 
153
+ filepath = os.path.join(temp_dir, filename)
 
154
 
155
+ # Download with retry logic
156
+ max_retries = 3
157
+ for attempt in range(max_retries):
158
+ try:
159
+ pdf_response = session.get(pdf_url, timeout=60, stream=True)
160
+ pdf_response.raise_for_status()
161
+
162
+ # Verify it's actually a PDF
163
+ content_type = pdf_response.headers.get('content-type', '').lower()
164
+ if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
165
+ raise Exception(f"Not a PDF file (content-type: {content_type})")
166
+
167
+ # Save PDF
168
+ with open(filepath, 'wb') as f:
169
+ for chunk in pdf_response.iter_content(chunk_size=8192):
170
+ if chunk:
171
+ f.write(chunk)
172
+
173
+ # Verify file was written and has content
174
+ if os.path.getsize(filepath) == 0:
175
+ raise Exception("Downloaded file is empty")
176
+
177
+ successful += 1
178
+ break # Success, exit retry loop
179
+
180
+ except Exception as e:
181
+ if attempt == max_retries - 1:
182
+ raise # Last attempt failed
183
+ time.sleep(2) # Wait before retry
184
 
185
+ # Be polite - delay between downloads
186
  time.sleep(1)
187
 
188
  except Exception as e:
189
  failed += 1
190
  status_messages.append(f"Failed: {filename} - {str(e)}")
191
+ # Clean up failed download
192
+ if os.path.exists(filepath):
193
+ os.remove(filepath)
194
  continue
195
 
196
  if successful == 0:
197
+ return None, f"❌ Failed to download any PDFs.\n\n**Possible reasons:**\n- PDFs require authentication\n- Links are not direct PDF URLs\n- Website blocking automated downloads\n\n**Failed attempts:** {len(status_messages)}\n**Sample errors:**\n" + "\n".join(status_messages[:3])
198
 
199
  # Create zip file
200
  progress(0.9, desc="Creating zip file...")
 
206
  file_path = os.path.join(root, file)
207
  zipf.write(file_path, os.path.basename(file_path))
208
 
209
+ # Clean up
210
  shutil.rmtree(temp_dir)
211
 
212
+ # Create status message
213
  status_msg = f"✅ **Download Complete!**\n\n"
214
  status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
215
  status_msg += f"- **Successfully downloaded:** {successful}\n"
 
217
  status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n"
218
 
219
  if failed > 0 and len(status_messages) > 0:
220
+ status_msg += "\n**Failed Downloads (first 10):**\n" + "\n".join(status_messages[:10])
221
+ if len(status_messages) > 10:
222
+ status_msg += f"\n... and {len(status_messages) - 10} more"
223
 
224
  progress(1.0, desc="Complete!")
225
  return zip_path, status_msg
 
231
  shutil.rmtree(temp_dir)
232
  return None, f"❌ Unexpected error: {str(e)}"
233
 
234
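+ # NOTE: this commit removes the Gradio interface block (old lines 212-232) and leaves only this placeholder; restore the gr.Blocks UI and demo.launch() for the app to start.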