Spaces:

MicroHealth
/

Bulk-PDF-download

Sleeping

App Files Files Community

bluenevus commited on Oct 9

Commit

ea2d648

verified ·

1 Parent(s): 274f96e

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -67

app.py CHANGED Viewed

@@ -8,63 +8,40 @@ import zipfile
 import shutil
 from datetime import datetime
 from selenium import webdriver
-from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 def setup_selenium_driver():
-    """
-    Configure Selenium WebDriver for Hugging Face Spaces.
-    Uses system-installed Chromium instead of webdriver-manager.
-    """
     chrome_options = Options()
-    # Essential options for cloud environments
     chrome_options.add_argument('--headless')
     chrome_options.add_argument('--no-sandbox')
     chrome_options.add_argument('--disable-dev-shm-usage')
     chrome_options.add_argument('--disable-gpu')
-    chrome_options.add_argument('--disable-software-rasterizer')
-    # Anti-detection options
     chrome_options.add_argument('--disable-blink-features=AutomationControlled')
     chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
     chrome_options.add_experimental_option('useAutomationExtension', False)
-    # Mimic real browser
     chrome_options.add_argument('--window-size=1920,1080')
     chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')
-    # Performance optimizations
-    chrome_options.add_argument('--disable-extensions')
-    chrome_options.add_argument('--disable-infobars')
-    chrome_options.add_argument('--disable-notifications')
-    chrome_options.add_argument('--disable-images')
-    chrome_options.page_load_strategy = 'eager'
-    # Specify binary locations for Hugging Face
     chrome_options.binary_location = '/usr/bin/chromium'
     try:
-        # Use system ChromeDriver path
         service = Service('/usr/bin/chromedriver')
         driver = webdriver.Chrome(service=service, options=chrome_options)
-        # Additional anti-detection JavaScript
         driver.execute_cdp_cmd('Network.setUserAgentOverride', {
             "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
         })
         driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
         return driver
     except Exception as e:
         raise Exception(f"Failed to initialize Chrome driver: {str(e)}")
 def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
     """
-    Download all PDFs using Selenium WebDriver and create a zip file.
     """
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     temp_dir = f"temp_pdfs_{timestamp}"
@@ -82,7 +59,7 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
         try:
             driver = setup_selenium_driver()
         except Exception as e:
-            return None, f"❌ Error initializing browser: {str(e)}\n\n**Troubleshooting:**\n- Ensure Chrome/Chromium is installed\n- Check if ChromeDriver is accessible\n- Verify Hugging Face Space has necessary packages"
         # Navigate to the page
         progress(0.1, desc="Loading webpage with browser...")
@@ -92,7 +69,6 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
                 EC.presence_of_element_located((By.TAG_NAME, "body"))
             )
             time.sleep(3)
         except Exception as e:
             if driver:
                 driver.quit()
@@ -122,25 +98,37 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
                 driver.quit()
             return None, f"❌ Error finding PDF links: {str(e)}"
-        driver.quit()
-        driver = None
         if len(pdf_links) == 0:
             return None, "⚠️ No PDF files found on this page."
-        progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
-        # Download PDFs
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
             'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
             'Referer': url,
         }
-        session = requests.Session()
         session.headers.update(headers)
         successful = 0
         failed = 0
         status_messages = []
@@ -150,33 +138,63 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
                 progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
                 progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
                 parsed_url = urlparse(pdf_url)
                 filename = os.path.basename(parsed_url.path)
                 if not filename or filename == '.pdf':
                     filename = f"document_{idx + 1}.pdf"
                 filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
-                filepath = os.path.join(temp_dir, filename)
-                pdf_response = session.get(pdf_url, timeout=60, stream=True)
-                pdf_response.raise_for_status()
-                with open(filepath, 'wb') as f:
-                    for chunk in pdf_response.iter_content(chunk_size=8192):
-                        if chunk:
-                            f.write(chunk)
-                successful += 1
                 time.sleep(1)
             except Exception as e:
                 failed += 1
                 status_messages.append(f"Failed: {filename} - {str(e)}")
                 continue
         if successful == 0:
-            return None, "❌ Failed to download any PDFs."
         # Create zip file
         progress(0.9, desc="Creating zip file...")
@@ -188,8 +206,10 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
                     file_path = os.path.join(root, file)
                     zipf.write(file_path, os.path.basename(file_path))
         shutil.rmtree(temp_dir)
         status_msg = f"✅ **Download Complete!**\n\n"
         status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
         status_msg += f"- **Successfully downloaded:** {successful}\n"
@@ -197,7 +217,9 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
         status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n"
         if failed > 0 and len(status_messages) > 0:
-            status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
         progress(1.0, desc="Complete!")
         return zip_path, status_msg
@@ -209,24 +231,4 @@ def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
             shutil.rmtree(temp_dir)
         return None, f"❌ Unexpected error: {str(e)}"
-# Gradio interface (same as before)
-with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 📥 PDF Downloader & Zipper (Selenium)
-    Downloads all PDFs from a webpage using Selenium WebDriver.
-    """)
-    url_input = gr.Textbox(label="Enter Webpage URL", placeholder="https://example.com/pdfs/")
-    submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary")
-    status_output = gr.Markdown()
-    download_output = gr.File(label="Download Zip File")
-    submit_btn.click(
-        fn=download_and_zip_pdfs_selenium,
-        inputs=[url_input],
-        outputs=[download_output, status_output]
-    )
-if __name__ == "__main__":
-    demo.launch()

 import shutil
 from datetime import datetime
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 def setup_selenium_driver():
+    """Configure Selenium WebDriver for Hugging Face Spaces."""
     chrome_options = Options()
     chrome_options.add_argument('--headless')
     chrome_options.add_argument('--no-sandbox')
     chrome_options.add_argument('--disable-dev-shm-usage')
     chrome_options.add_argument('--disable-gpu')
     chrome_options.add_argument('--disable-blink-features=AutomationControlled')
     chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
     chrome_options.add_experimental_option('useAutomationExtension', False)
     chrome_options.add_argument('--window-size=1920,1080')
     chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')
     chrome_options.binary_location = '/usr/bin/chromium'
     try:
         service = Service('/usr/bin/chromedriver')
         driver = webdriver.Chrome(service=service, options=chrome_options)
         driver.execute_cdp_cmd('Network.setUserAgentOverride', {
             "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
         })
         driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
         return driver
     except Exception as e:
         raise Exception(f"Failed to initialize Chrome driver: {str(e)}")
 def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
     """
+    Download all PDFs using Selenium WebDriver with proper session management.
     """
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     temp_dir = f"temp_pdfs_{timestamp}"
         try:
             driver = setup_selenium_driver()
         except Exception as e:
+            return None, f"❌ Error initializing browser: {str(e)}"
         # Navigate to the page
         progress(0.1, desc="Loading webpage with browser...")
                 EC.presence_of_element_located((By.TAG_NAME, "body"))
             )
             time.sleep(3)
         except Exception as e:
             if driver:
                 driver.quit()
                 driver.quit()
             return None, f"❌ Error finding PDF links: {str(e)}"
         if len(pdf_links) == 0:
+            if driver:
+                driver.quit()
             return None, "⚠️ No PDF files found on this page."
+        # **KEY FIX: Transfer Selenium cookies to requests session**
+        progress(0.25, desc="Transferring session cookies...")
+        session = requests.Session()
+        # Copy all cookies from Selenium to requests
+        for cookie in driver.get_cookies():
+            session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
+        # Copy headers from Selenium
         headers = {
+            'User-Agent': driver.execute_script("return navigator.userAgent;"),
             'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
             'Referer': url,
         }
         session.headers.update(headers)
+        # Now we can close the browser
+        driver.quit()
+        driver = None
+        progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
+        # Download PDFs with the authenticated session
         successful = 0
         failed = 0
         status_messages = []
                 progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
                 progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
+                # Extract and sanitize filename
                 parsed_url = urlparse(pdf_url)
                 filename = os.path.basename(parsed_url.path)
                 if not filename or filename == '.pdf':
                     filename = f"document_{idx + 1}.pdf"
+                # Remove invalid characters
                 filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
+                if not filename.endswith('.pdf'):
+                    filename += '.pdf'
+                filepath = os.path.join(temp_dir, filename)
+                # Download with retry logic
+                max_retries = 3
+                for attempt in range(max_retries):
+                    try:
+                        pdf_response = session.get(pdf_url, timeout=60, stream=True)
+                        pdf_response.raise_for_status()
+                        # Verify it's actually a PDF
+                        content_type = pdf_response.headers.get('content-type', '').lower()
+                        if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
+                            raise Exception(f"Not a PDF file (content-type: {content_type})")
+                        # Save PDF
+                        with open(filepath, 'wb') as f:
+                            for chunk in pdf_response.iter_content(chunk_size=8192):
+                                if chunk:
+                                    f.write(chunk)
+                        # Verify file was written and has content
+                        if os.path.getsize(filepath) == 0:
+                            raise Exception("Downloaded file is empty")
+                        successful += 1
+                        break  # Success, exit retry loop
+                    except Exception as e:
+                        if attempt == max_retries - 1:
+                            raise  # Last attempt failed
+                        time.sleep(2)  # Wait before retry
+                # Be polite - delay between downloads
                 time.sleep(1)
             except Exception as e:
                 failed += 1
                 status_messages.append(f"Failed: {filename} - {str(e)}")
+                # Clean up failed download
+                if os.path.exists(filepath):
+                    os.remove(filepath)
                 continue
         if successful == 0:
+            return None, f"❌ Failed to download any PDFs.\n\n**Possible reasons:**\n- PDFs require authentication\n- Links are not direct PDF URLs\n- Website blocking automated downloads\n\n**Failed attempts:** {len(status_messages)}\n**Sample errors:**\n" + "\n".join(status_messages[:3])
         # Create zip file
         progress(0.9, desc="Creating zip file...")
                     file_path = os.path.join(root, file)
                     zipf.write(file_path, os.path.basename(file_path))
+        # Clean up
         shutil.rmtree(temp_dir)
+        # Create status message
         status_msg = f"✅ **Download Complete!**\n\n"
         status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
         status_msg += f"- **Successfully downloaded:** {successful}\n"
         status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n"
         if failed > 0 and len(status_messages) > 0:
+            status_msg += "\n**Failed Downloads (first 10):**\n" + "\n".join(status_messages[:10])
+            if len(status_messages) > 10:
+                status_msg += f"\n... and {len(status_messages) - 10} more"
         progress(1.0, desc="Complete!")
         return zip_path, status_msg
             shutil.rmtree(temp_dir)
         return None, f"❌ Unexpected error: {str(e)}"
+# Gradio interface remains the same...