bluenevus committed on
Commit d23df15 · verified · 1 Parent(s): 3115e99

Update app.py

Files changed (1):
  1. app.py +199 -216

app.py CHANGED
@@ -1,253 +1,236 @@
- import gradio as gr
  import requests
  from bs4 import BeautifulSoup
  import os
  from urllib.parse import urljoin, urlparse
  import time
  import zipfile
  import shutil
- from datetime import datetime
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
-
- def setup_selenium_driver():
-     """Configure Selenium WebDriver for Hugging Face Spaces."""
-     chrome_options = Options()
-     chrome_options.add_argument('--headless')
-     chrome_options.add_argument('--no-sandbox')
-     chrome_options.add_argument('--disable-dev-shm-usage')
-     chrome_options.add_argument('--disable-gpu')
-     chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-     chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-     chrome_options.add_experimental_option('useAutomationExtension', False)
-     chrome_options.add_argument('--window-size=1920,1080')
-     chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')
-     chrome_options.binary_location = '/usr/bin/chromium'
-
-     try:
-         service = Service('/usr/bin/chromedriver')
-         driver = webdriver.Chrome(service=service, options=chrome_options)
-         driver.execute_cdp_cmd('Network.setUserAgentOverride', {
-             "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
-         })
-         driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
-         return driver
-     except Exception as e:
-         raise Exception(f"Failed to initialize Chrome driver: {str(e)}")

- def download_and_zip_pdfs_selenium(url, progress=gr.Progress()):
      """
-     Download all PDFs using Selenium WebDriver with proper session management.
      """
-     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-     temp_dir = f"temp_pdfs_{timestamp}"
-     zip_filename = f"downloaded_pdfs_{timestamp}.zip"
-     driver = None

      try:
-         os.makedirs(temp_dir, exist_ok=True)
-         progress(0, desc="Initializing browser...")
-
-         if not url.startswith(('http://', 'https://')):
-             return None, "❌ Error: Please provide a valid URL starting with http:// or https://"
-
-         # Initialize Selenium driver
-         try:
-             driver = setup_selenium_driver()
-         except Exception as e:
-             return None, f"❌ Error initializing browser: {str(e)}"
-
-         # Navigate to the page
-         progress(0.1, desc="Loading webpage with browser...")
-         try:
-             driver.get(url)
-             WebDriverWait(driver, 20).until(
-                 EC.presence_of_element_located((By.TAG_NAME, "body"))
-             )
-             time.sleep(3)
-         except Exception as e:
-             if driver:
-                 driver.quit()
-             return None, f"❌ Error loading webpage: {str(e)}"
-
-         # Find all PDF links
-         progress(0.2, desc="Finding PDF links...")
-         try:
-             page_source = driver.page_source
-             soup = BeautifulSoup(page_source, 'html.parser')
-
-             all_links = soup.find_all('a', href=True)
-             pdf_links = []
-
-             for link in all_links:
-                 href = link['href']
-                 if href.lower().endswith('.pdf'):
-                     full_url = urljoin(url, href)
-                     pdf_links.append(full_url)
-
-             if len(pdf_links) == 0:
-                 pdf_elements = driver.find_elements(By.XPATH, "//a[contains(@href, '.pdf')]")
-                 pdf_links = [elem.get_attribute('href') for elem in pdf_elements if elem.get_attribute('href')]
-
-         except Exception as e:
-             if driver:
-                 driver.quit()
-             return None, f"❌ Error finding PDF links: {str(e)}"

          if len(pdf_links) == 0:
-             if driver:
-                 driver.quit()
-             return None, "⚠️ No PDF files found on this page."
-
-         # **KEY FIX: Transfer Selenium cookies to requests session**
-         progress(0.25, desc="Transferring session cookies...")
-         session = requests.Session()
-
-         # Copy all cookies from Selenium to requests
-         for cookie in driver.get_cookies():
-             session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
-
-         # Copy headers from Selenium
-         headers = {
-             'User-Agent': driver.execute_script("return navigator.userAgent;"),
-             'Accept': 'application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.9',
-             'Accept-Encoding': 'gzip, deflate, br',
-             'Connection': 'keep-alive',
-             'Referer': url,
-         }
-         session.headers.update(headers)
-
-         # Now we can close the browser
-         driver.quit()
-         driver = None
-
-         progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")
-
-         # Download PDFs with the authenticated session
          successful = 0
          failed = 0
-         status_messages = []

-         for idx, pdf_url in enumerate(pdf_links):
              try:
-                 progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
-                 progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")
-
-                 # Extract and sanitize filename
                  parsed_url = urlparse(pdf_url)
-                 filename = os.path.basename(parsed_url.path)

-                 if not filename or filename == '.pdf':
-                     filename = f"document_{idx + 1}.pdf"

-                 # Remove invalid characters
-                 filename = "".join(c for c in filename if c.isalnum() or c in (' ', '.', '_', '-')).rstrip()
-                 if not filename.endswith('.pdf'):
-                     filename += '.pdf'

-                 filepath = os.path.join(temp_dir, filename)

-                 # Download with retry logic
-                 max_retries = 3
-                 for attempt in range(max_retries):
-                     try:
-                         pdf_response = session.get(pdf_url, timeout=60, stream=True)
-                         pdf_response.raise_for_status()
-
-                         # Verify it's actually a PDF
-                         content_type = pdf_response.headers.get('content-type', '').lower()
-                         if 'pdf' not in content_type and 'application/octet-stream' not in content_type:
-                             raise Exception(f"Not a PDF file (content-type: {content_type})")
-
-                         # Save PDF
-                         with open(filepath, 'wb') as f:
-                             for chunk in pdf_response.iter_content(chunk_size=8192):
-                                 if chunk:
-                                     f.write(chunk)
-
-                         # Verify file was written and has content
-                         if os.path.getsize(filepath) == 0:
-                             raise Exception("Downloaded file is empty")
-
-                         successful += 1
-                         break  # Success, exit retry loop
-
-                     except Exception as e:
-                         if attempt == max_retries - 1:
-                             raise  # Last attempt failed
-                         time.sleep(2)  # Wait before retry

-                 # Be polite - delay between downloads
-                 time.sleep(1)

              except Exception as e:
                  failed += 1
-                 status_messages.append(f"Failed: {filename} - {str(e)}")
-                 # Clean up failed download
-                 if os.path.exists(filepath):
-                     os.remove(filepath)
                  continue

-         if successful == 0:
-             return None, f"❌ Failed to download any PDFs.\n\n**Possible reasons:**\n- PDFs require authentication\n- Links are not direct PDF URLs\n- Website blocking automated downloads\n\n**Failed attempts:** {len(status_messages)}\n**Sample errors:**\n" + "\n".join(status_messages[:3])
-
-         # Create zip file
-         progress(0.9, desc="Creating zip file...")
-         zip_path = os.path.join(os.getcwd(), zip_filename)
-
-         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-             for root, dirs, files in os.walk(temp_dir):
-                 for file in files:
-                     file_path = os.path.join(root, file)
-                     zipf.write(file_path, os.path.basename(file_path))
-
-         # Clean up
-         shutil.rmtree(temp_dir)
-
-         # Create status message
-         status_msg = f"✅ **Download Complete!**\n\n"
-         status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
-         status_msg += f"- **Successfully downloaded:** {successful}\n"
-         status_msg += f"- **Failed:** {failed}\n"
-         status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n"
-
-         if failed > 0 and len(status_messages) > 0:
-             status_msg += "\n**Failed Downloads (first 10):**\n" + "\n".join(status_messages[:10])
-             if len(status_messages) > 10:
-                 status_msg += f"\n... and {len(status_messages) - 10} more"

-         progress(1.0, desc="Complete!")
-         return zip_path, status_msg

      except Exception as e:
-         if driver:
-             driver.quit()
-         if os.path.exists(temp_dir):
-             shutil.rmtree(temp_dir)
          return None, f"❌ Unexpected error: {str(e)}"

- with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("""
-     # 📥 PDF Downloader & Zipper (Selenium)
-
-     Downloads all PDFs from a webpage using Selenium WebDriver.
-     """)
-
-     url_input = gr.Textbox(label="Enter Webpage URL", placeholder="https://example.com/pdfs/")
-     submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary")
-     status_output = gr.Markdown()
-     download_output = gr.File(label="Download Zip File")

-     submit_btn.click(
-         fn=download_and_zip_pdfs_selenium,
-         inputs=[url_input],
-         outputs=[download_output, status_output]
-     )

  if __name__ == "__main__":
-     demo.launch()
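The removed implementation above drove headless Chromium through Selenium; its key move was handing the browser's cookies and User-Agent to a requests.Session so that PDF links served behind that session kept working after the browser closed. For reference, that pattern distills to a few lines; a minimal sketch, assuming a local Chromium/chromedriver install, with a purely hypothetical target URL:

```python
# Sketch of the cookie hand-off the removed code performed: establish a
# session in a real browser, then reuse it from requests for the downloads.
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # assumes chromedriver on PATH
driver.get("https://example.com/pdfs/")     # hypothetical URL

session = requests.Session()
for cookie in driver.get_cookies():
    # requests only needs name/value/domain from each Selenium cookie
    session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))
session.headers['User-Agent'] = driver.execute_script("return navigator.userAgent;")
driver.quit()

# session.get(pdf_url) now presents the cookies the browser established.
```

The rewritten file, which drops the browser entirely in favor of plain requests, follows.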
  import requests
  from bs4 import BeautifulSoup
  import os
  from urllib.parse import urljoin, urlparse
  import time
  import zipfile
+ import io
+ import gradio as gr
+ from pathlib import Path
+ import tempfile
  import shutil

+ def download_pdfs_from_page(url, output_option="zip", output_dir='downloaded_pdfs', progress=gr.Progress()):
      """
+     Download all PDFs from a webpage.
+
+     Args:
+         url: The webpage URL to scrape
+         output_option: Either "zip" or "directory"
+         output_dir: Directory to save downloaded PDFs (only used if output_option is "directory")
+         progress: Gradio progress tracker
+
+     Returns:
+         For zip: tuple of (zip_file_path, summary_message)
+         For directory: tuple of (None, summary_message)
      """
+
+     # Set headers to mimic a browser request
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+     }

      try:
+         # Fetch the webpage
+         progress(0, desc="Fetching webpage...")
+         response = requests.get(url, headers=headers, timeout=30)
+         response.raise_for_status()
+
+         # Parse HTML
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Find all links
+         all_links = soup.find_all('a', href=True)
+
+         # Filter for PDF links (including those with query parameters)
+         pdf_links = []
+         for link in all_links:
+             href = link['href']
+             if '.pdf' in href.lower():
+                 full_url = urljoin(url, href)
+                 pdf_links.append(full_url)

          if len(pdf_links) == 0:
+             return None, "❌ No PDF links found on the page."
+
+         progress(0.1, desc=f"Found {len(pdf_links)} PDF links")
+
+         # Create temporary directory for downloads
+         temp_dir = tempfile.mkdtemp()
+
+         # Download each PDF
          successful = 0
          failed = 0
+         failed_urls = []

+         for idx, pdf_url in enumerate(pdf_links, 1):
              try:
+                 # Extract filename from URL (remove query parameters)
                  parsed_url = urlparse(pdf_url)
+                 path_without_query = parsed_url.path
+                 filename = os.path.basename(path_without_query)

+                 # Create full file path in temp directory
+                 filepath = os.path.join(temp_dir, filename)

+                 # Update progress
+                 progress((0.1 + (0.8 * idx / len(pdf_links))),
+                          desc=f"Downloading {idx}/{len(pdf_links)}: {filename}")

+                 # Download PDF
+                 pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
+                 pdf_response.raise_for_status()
+
+                 # Save PDF
+                 with open(filepath, 'wb') as f:
+                     f.write(pdf_response.content)

+                 successful += 1

+                 # Be polite - add a small delay between downloads
+                 time.sleep(0.5)

              except Exception as e:
                  failed += 1
+                 failed_urls.append(f"{filename}: {str(e)}")
                  continue

+         # Generate summary message
+         summary = f"""
+ ✅ **Download Complete!**
+
+ 📊 **Summary:**
+ - Total PDFs found: {len(pdf_links)}
+ - Successfully downloaded: {successful}
+ - Failed: {failed}
+ """
+
+         if failed > 0:
+             summary += f"\n\n⚠️ **Failed Downloads:**\n"
+             for fail in failed_urls[:10]:  # Show first 10 failures
+                 summary += f"- {fail}\n"
+             if len(failed_urls) > 10:
+                 summary += f"- ... and {len(failed_urls) - 10} more\n"
+
+         # Handle output based on user choice
+         if output_option == "zip":
+             progress(0.9, desc="Creating zip file...")
+
+             # Create zip file
+             zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
+             with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                 for root, dirs, files in os.walk(temp_dir):
+                     for file in files:
+                         file_path = os.path.join(root, file)
+                         zipf.write(file_path, arcname=file)
+
+             # Clean up temp directory
+             shutil.rmtree(temp_dir)
+
+             progress(1.0, desc="Complete!")
+             return zip_path, summary

+         else:  # output_option == "directory"
+             progress(0.9, desc="Copying files to output directory...")
+
+             # Create output directory if it doesn't exist
+             if not os.path.exists(output_dir):
+                 os.makedirs(output_dir)
+
+             # Copy files from temp to output directory
+             for file in os.listdir(temp_dir):
+                 shutil.copy2(os.path.join(temp_dir, file), os.path.join(output_dir, file))
+
+             # Clean up temp directory
+             shutil.rmtree(temp_dir)
+
+             summary += f"\n\n📁 **Files saved to:** `{os.path.abspath(output_dir)}`"
+
+             progress(1.0, desc="Complete!")
+             return None, summary

+     except requests.exceptions.RequestException as e:
+         return None, f"❌ Error fetching webpage: {str(e)}"
      except Exception as e:
          return None, f"❌ Unexpected error: {str(e)}"

+ # Create Gradio interface
+ def create_interface():
+     with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
+         gr.Markdown(
+             """
+             # 📥 PDF Downloader
+             Download all PDFs from any webpage with ease!
+
+             **Instructions:**
+             1. Enter the URL of the webpage containing PDF links
+             2. Choose whether to download as a ZIP file or save to a directory
+             3. If saving to directory, specify the directory name
+             4. Click "Download PDFs"
+             """
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 url_input = gr.Textbox(
+                     label="Webpage URL",
+                     placeholder="https://example.com/pdfs",
+                     lines=1
+                 )
+
+                 output_option = gr.Radio(
+                     choices=["zip", "directory"],
+                     value="zip",
+                     label="Output Option",
+                     info="Choose how to receive the downloaded PDFs"
+                 )
+
+                 output_dir = gr.Textbox(
+                     label="Output Directory (only for 'directory' option)",
+                     placeholder="downloaded_pdfs",
+                     value="downloaded_pdfs",
+                     lines=1,
+                     visible=False
+                 )
+
+                 download_btn = gr.Button("📥 Download PDFs", variant="primary", size="lg")
+
+         with gr.Row():
+             with gr.Column():
+                 output_file = gr.File(label="Download ZIP", visible=True)
+                 summary_output = gr.Markdown(label="Summary")
+
+         # Show/hide directory input based on output option
+         def toggle_directory_input(option):
+             return gr.update(visible=(option == "directory"))
+
+         output_option.change(
+             fn=toggle_directory_input,
+             inputs=[output_option],
+             outputs=[output_dir]
+         )
+
+         # Handle download button click
+         download_btn.click(
+             fn=download_pdfs_from_page,
+             inputs=[url_input, output_option, output_dir],
+             outputs=[output_file, summary_output]
+         )
+
+         gr.Markdown(
+             """
+             ---
+             ### 💡 Tips:
+             - The script will find all PDF links on the page, including those with query parameters
+             - Downloads include a small delay between requests to be respectful to servers
+             - ZIP files are automatically named with a timestamp
+             - For directory output, files are saved to your local filesystem
+             """
+         )

+     return demo

+ # Launch the interface
  if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch(share=True)
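One caveat against the new filename logic: os.path.basename(parsed_url.path) returns an empty string when a PDF URL's path ends in a slash, and unlike the removed version, the rewrite no longer falls back to a generated document_N.pdf name, so such downloads could collide on an empty filename in temp_dir. A hedged guard, sketched under the assumption that an index-based fallback is acceptable (pdf_filename is an illustrative helper, not part of the commit):

```python
# Sketch: derive a safe filename from a PDF URL, with an assumed fallback.
import os
from urllib.parse import urlparse

def pdf_filename(pdf_url: str, idx: int) -> str:
    path = urlparse(pdf_url).path  # .path drops both ?query and #fragment
    name = os.path.basename(path)
    if not name or not name.lower().endswith('.pdf'):
        name = f"document_{idx}.pdf"  # assumed fallback; not in the commit
    return name

print(pdf_filename("https://example.com/docs/report.pdf?v=2", 1))  # report.pdf
print(pdf_filename("https://example.com/docs/", 2))                # document_2.pdf
```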