Spaces:

MicroHealth
/

Bulk-PDF-download

Sleeping

App Files Community

bluenevus committed on Oct 9

Commit

9331adf

verified ·

1 Parent(s): 87e5c3e

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -55

app.py CHANGED Viewed

@@ -7,72 +7,150 @@ import time
 import zipfile
 import shutil
 from datetime import datetime
-def download_and_zip_pdfs(url, progress=gr.Progress()):
     """
-    Download all PDFs from a given URL and create a zip file.
-    Enhanced with better headers to avoid 403 errors.
     """
-    # Create unique temporary directory for this session
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     temp_dir = f"temp_pdfs_{timestamp}"
     zip_filename = f"downloaded_pdfs_{timestamp}.zip"
     try:
         os.makedirs(temp_dir, exist_ok=True)
-        progress(0, desc="Initializing...")
         if not url.startswith(('http://', 'https://')):
             return None, "❌ Error: Please provide a valid URL starting with http:// or https://"
-        # Enhanced headers to mimic a real browser
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.9',
-            'Accept-Encoding': 'gzip, deflate, br',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'none',
-            'Sec-Fetch-User': '?1',
-            'Cache-Control': 'max-age=0',
-        }
-        # Create a session to maintain cookies
-        session = requests.Session()
-        session.headers.update(headers)
-        # Fetch webpage with session
-        progress(0.1, desc="Fetching webpage...")
         try:
-            response = session.get(url, timeout=30, allow_redirects=True)
-            response.raise_for_status()
-        except requests.exceptions.RequestException as e:
-            return None, f"❌ Error fetching webpage: {str(e)}\n\nTip: The website may be blocking automated requests. Try using Solution 2 with Selenium."
-        # Parse HTML
-        progress(0.2, desc="Parsing HTML...")
-        soup = BeautifulSoup(response.content, 'html.parser')
         # Find all PDF links
-        all_links = soup.find_all('a', href=True)
-        pdf_links = []
-        for link in all_links:
-            href = link['href']
-            if href.lower().endswith('.pdf'):
-                full_url = urljoin(url, href)
-                pdf_links.append(full_url)
         if len(pdf_links) == 0:
             return None, "⚠️ No PDF files found on this page."
         progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
-        # Download PDFs using the same session
         successful = 0
         failed = 0
         status_messages = []
@@ -82,26 +160,30 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
                 progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
                 progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
                 parsed_url = urlparse(pdf_url)
                 filename = os.path.basename(parsed_url.path)
                 if not filename or filename == '.pdf':
                     filename = f"document_{idx + 1}.pdf"
                 filepath = os.path.join(temp_dir, filename)
-                # Download PDF with session
                 pdf_response = session.get(pdf_url, timeout=60, stream=True)
                 pdf_response.raise_for_status()
                 # Save PDF
                 with open(filepath, 'wb') as f:
                     for chunk in pdf_response.iter_content(chunk_size=8192):
-                        f.write(chunk)
                 successful += 1
-                # Polite delay
                 time.sleep(1)
             except Exception as e:
@@ -110,7 +192,7 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
                 continue
         if successful == 0:
-            return None, "❌ Failed to download any PDFs. The website may have strong anti-bot protection."
         # Create zip file
         progress(0.9, desc="Creating zip file...")
@@ -123,7 +205,7 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
                     arcname = os.path.basename(file_path)
                     zipf.write(file_path, arcname)
-        # Clean up
         shutil.rmtree(temp_dir)
         # Create status message
@@ -132,6 +214,7 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
         status_msg += f"- **Successfully downloaded:** {successful}\n"
         status_msg += f"- **Failed:** {failed}\n"
         status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n\n"
         if failed > 0 and len(status_messages) > 0:
             status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
@@ -143,39 +226,48 @@ def download_and_zip_pdfs(url, progress=gr.Progress()):
         return zip_path, status_msg
     except Exception as e:
         if os.path.exists(temp_dir):
             shutil.rmtree(temp_dir)
         return None, f"❌ Unexpected error: {str(e)}"
 # Create Gradio interface
-with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 📥 PDF Downloader & Zipper
-    Enter a URL to download all PDF files from that webpage and receive them as a single zip file.
     **Instructions:**
     1. Paste the URL of the webpage containing PDFs
     2. Click "Download & Zip PDFs"
-    3. Wait for processing (may take several minutes for many PDFs)
     4. Download your zip file!
     """)
     with gr.Row():
         url_input = gr.Textbox(
             label="Enter Webpage URL",
-            placeholder="https://www.example.com/documents/",
             lines=1,
             scale=4
         )
     with gr.Row():
-        submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary", scale=1)
         clear_btn = gr.Button("🔄 Clear", scale=1)
     status_output = gr.Markdown(label="Status")
     download_output = gr.File(label="Download Zip File")
     # Examples
@@ -189,7 +281,7 @@ with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
     # Event handlers
     submit_btn.click(
-        fn=download_and_zip_pdfs,
         inputs=[url_input],
         outputs=[download_output, status_output]
     )
@@ -200,6 +292,5 @@ with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
         outputs=[download_output, url_input, status_output]
     )
-# Launch the app
 if __name__ == "__main__":
     demo.launch()

 import zipfile
 import shutil
 from datetime import datetime
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
def setup_selenium_driver():
    """
    Configure and return a headless Chrome WebDriver optimized for Hugging Face.

    The browser is started headless with sandboxing and /dev/shm usage
    disabled (needed in containerised cloud environments), a desktop
    user agent, and automation hints suppressed.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and
        must call ``quit()`` when finished.

    Raises:
        RuntimeError: If ChromeDriver cannot be installed or started, or if
            the post-start configuration fails. Any browser that was already
            started is shut down before the error is raised.
    """
    # Single source of truth for the spoofed user agent; it is applied both
    # as a command-line switch and as a CDP network override.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'

    chrome_options = Options()
    # Essential options for cloud environments
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-software-rasterizer')
    # Anti-detection options
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # Mimic real browser
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument(f'user-agent={user_agent}')
    # Additional privacy/security settings
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-infobars')
    chrome_options.add_argument('--disable-notifications')
    # Performance optimizations. '--disable-images' is not a Chromium switch,
    # so image loading is turned off through the Blink setting instead.
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
    chrome_options.page_load_strategy = 'eager'  # Don't wait for all resources

    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {
            "userAgent": user_agent
        })
        # Register the override for every new document. A plain
        # execute_script() call would only patch the initial blank page and
        # be discarded on the first navigation.
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        })
        return driver
    except Exception as e:
        # Do not leave an orphaned Chrome process behind when setup fails
        # after the browser has already been launched.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass  # best-effort cleanup; the original error is what matters
        raise RuntimeError(f"Failed to initialize Chrome driver: {str(e)}") from e
+def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
+    """
+    Download all PDFs using Selenium WebDriver and create a zip file.
+    """
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     temp_dir = f"temp_pdfs_{timestamp}"
     zip_filename = f"downloaded_pdfs_{timestamp}.zip"
+    driver = None
     try:
         os.makedirs(temp_dir, exist_ok=True)
+        progress(0, desc="Initializing browser...")
+        # Validate URL
         if not url.startswith(('http://', 'https://')):
             return None, "❌ Error: Please provide a valid URL starting with http:// or https://"
+        # Initialize Selenium driver
         try:
+            driver = setup_selenium_driver()
+        except Exception as e:
+            return None, f"❌ Error initializing browser: {str(e)}"
+        # Navigate to the page
+        progress(0.1, desc="Loading webpage with browser...")
+        try:
+            driver.get(url)
+            # Wait for page to load (adjust timeout as needed)
+            WebDriverWait(driver, 20).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+            # Additional wait for dynamic content
+            time.sleep(3)
+        except Exception as e:
+            if driver:
+                driver.quit()
+            return None, f"❌ Error loading webpage: {str(e)}"
         # Find all PDF links
+        progress(0.2, desc="Finding PDF links...")
+        try:
+            # Get page source and parse with BeautifulSoup
+            page_source = driver.page_source
+            soup = BeautifulSoup(page_source, 'html.parser')
+            # Find all links
+            all_links = soup.find_all('a', href=True)
+            pdf_links = []
+            for link in all_links:
+                href = link['href']
+                if href.lower().endswith('.pdf'):
+                    full_url = urljoin(url, href)
+                    pdf_links.append(full_url)
+            # Alternative: Use Selenium directly to find PDF links
+            if len(pdf_links) == 0:
+                pdf_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
+                pdf_links = [elem.get_attribute('href') for elem in pdf_elements if elem.get_attribute('href')]
+        except Exception as e:
+            if driver:
+                driver.quit()
+            return None, f"❌ Error finding PDF links: {str(e)}"
+        # Close the browser - we don't need it anymore
+        driver.quit()
+        driver = None
         if len(pdf_links) == 0:
             return None, "⚠️ No PDF files found on this page."
         progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
+        # Download PDFs using requests with enhanced headers
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+            'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Referer': url,
+        }
+        session = requests.Session()
+        session.headers.update(headers)
         successful = 0
         failed = 0
         status_messages = []
                 progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
                 progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
+                # Extract filename
                 parsed_url = urlparse(pdf_url)
                 filename = os.path.basename(parsed_url.path)
                 if not filename or filename == '.pdf':
                     filename = f"document_{idx + 1}.pdf"
+                # Sanitize filename
+                filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
                 filepath = os.path.join(temp_dir, filename)
+                # Download PDF
                 pdf_response = session.get(pdf_url, timeout=60, stream=True)
                 pdf_response.raise_for_status()
                 # Save PDF
                 with open(filepath, 'wb') as f:
                     for chunk in pdf_response.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
                 successful += 1
+                # Be polite - delay between downloads
                 time.sleep(1)
             except Exception as e:
                 continue
         if successful == 0:
+            return None, "❌ Failed to download any PDFs. Please check the URL and try again."
         # Create zip file
         progress(0.9, desc="Creating zip file...")
                     arcname = os.path.basename(file_path)
                     zipf.write(file_path, arcname)
+        # Clean up temporary directory
         shutil.rmtree(temp_dir)
         # Create status message
         status_msg += f"- **Successfully downloaded:** {successful}\n"
         status_msg += f"- **Failed:** {failed}\n"
         status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n\n"
+        status_msg += f"**Method:** Selenium WebDriver (bypasses most anti-bot protection)\n"
         if failed > 0 and len(status_messages) > 0:
             status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
         return zip_path, status_msg
     except Exception as e:
+        if driver:
+            driver.quit()
         if os.path.exists(temp_dir):
             shutil.rmtree(temp_dir)
         return None, f"❌ Unexpected error: {str(e)}"
 # Create Gradio interface
+with gr.Blocks(title="PDF Downloader & Zipper (Selenium)", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # 📥 PDF Downloader & Zipper (Selenium WebDriver)
+    This version uses **Selenium WebDriver** to bypass anti-bot protection and access websites that block cloud platforms.
+    **How it works:**
+    - Uses a real Chrome browser (headless) to load the page
+    - Mimics human browsing behavior
+    - Bypasses most IP-based blocking and anti-bot measures
+    - Works with government sites and other restricted sources
     **Instructions:**
     1. Paste the URL of the webpage containing PDFs
     2. Click "Download & Zip PDFs"
+    3. Wait for processing (first run may take longer to download Chrome driver)
     4. Download your zip file!
+    ⚠️ **Note:** First run will download ChromeDriver (~10MB) - this is normal and only happens once.
     """)
     with gr.Row():
         url_input = gr.Textbox(
             label="Enter Webpage URL",
+            placeholder="https://www.esd.whs.mil/Directives/issuances/dodi/",
             lines=1,
             scale=4
         )
     with gr.Row():
+        submit_btn = gr.Button("📥 Download & Zip PDFs (Selenium)", variant="primary", scale=1)
         clear_btn = gr.Button("🔄 Clear", scale=1)
     status_output = gr.Markdown(label="Status")
     download_output = gr.File(label="Download Zip File")
     # Examples
     # Event handlers
     submit_btn.click(
+        fn=download_and_zip_pdfs_selenium,
         inputs=[url_input],
         outputs=[download_output, status_output]
     )
         outputs=[download_output, url_input, status_output]
     )
 if __name__ == "__main__":
     demo.launch()