Create app.py
app.py (ADDED)
import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import zipfile
import shutil
from datetime import datetime

def download_and_zip_pdfs(url, progress=gr.Progress()):
    """
    Download all PDFs from a given URL and create a zip file.

    Args:
        url: The webpage URL to scrape
        progress: Gradio progress tracker

    Returns:
        tuple: (zip_file_path, status_message)
    """
    # Create a unique temporary directory for this session
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_dir = f"temp_pdfs_{timestamp}"
    zip_filename = f"downloaded_pdfs_{timestamp}.zip"

    try:
        # Create temporary directory
        os.makedirs(temp_dir, exist_ok=True)
        progress(0, desc="Initializing...")

        # Validate URL
        if not url.startswith(('http://', 'https://')):
            return None, "❌ Error: Please provide a valid URL starting with http:// or https://"

        # Use a browser-like User-Agent so simple bot filters don't reject the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        # Fetch webpage
        progress(0.1, desc="Fetching webpage...")
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            return None, f"❌ Error fetching webpage: {str(e)}"

        # Parse HTML
        progress(0.2, desc="Parsing HTML...")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all links ending in .pdf, resolving relative URLs against the page URL
        all_links = soup.find_all('a', href=True)
        pdf_links = []

        for link in all_links:
            href = link['href']
            if href.lower().endswith('.pdf'):
                full_url = urljoin(url, href)
                pdf_links.append(full_url)

        if len(pdf_links) == 0:
            return None, "⚠️ No PDF files found on this page."

        progress(0.3, desc=f"Found {len(pdf_links)} PDFs. Starting downloads...")

        # Download PDFs
        successful = 0
        failed = 0
        status_messages = []

        for idx, pdf_url in enumerate(pdf_links):
            try:
                # Update progress (downloads span the 0.3-0.9 range)
                progress_value = 0.3 + (0.6 * (idx / len(pdf_links)))
                progress(progress_value, desc=f"Downloading PDF {idx + 1}/{len(pdf_links)}...")

                # Extract filename from the URL path
                parsed_url = urlparse(pdf_url)
                filename = os.path.basename(parsed_url.path)

                # Handle empty or invalid filenames
                if not filename or filename == '.pdf':
                    filename = f"document_{idx + 1}.pdf"

                filepath = os.path.join(temp_dir, filename)

                # Avoid silently overwriting PDFs that share a filename
                if os.path.exists(filepath):
                    filename = f"{idx + 1}_{filename}"
                    filepath = os.path.join(temp_dir, filename)

                # Download PDF
                pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                pdf_response.raise_for_status()

                # Save PDF
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                successful += 1

                # Small delay to be polite to the server
                time.sleep(0.5)

            except Exception as e:
                failed += 1
                status_messages.append(f"Failed: {pdf_url} - {str(e)}")
                continue

        if successful == 0:
            return None, "❌ Failed to download any PDFs. Please check the URL and try again."

        # Create zip file
        progress(0.9, desc="Creating zip file...")
        zip_path = os.path.join(os.getcwd(), zip_filename)

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.basename(file_path)
                    zipf.write(file_path, arcname)

        # Clean up temporary directory
        shutil.rmtree(temp_dir)

        # Create status message
        status_msg = "✅ **Download Complete!**\n\n"
        status_msg += f"- **Total PDFs found:** {len(pdf_links)}\n"
        status_msg += f"- **Successfully downloaded:** {successful}\n"
        status_msg += f"- **Failed:** {failed}\n"
        status_msg += f"- **Zip file size:** {os.path.getsize(zip_path) / (1024*1024):.2f} MB\n\n"

        if failed > 0 and len(status_messages) > 0:
            status_msg += "\n**Failed Downloads:**\n" + "\n".join(status_messages[:10])
            if len(status_messages) > 10:
                status_msg += f"\n... and {len(status_messages) - 10} more"

        progress(1.0, desc="Complete!")

        return zip_path, status_msg

    except Exception as e:
        # Clean up on error
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        return None, f"❌ Unexpected error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="PDF Downloader & Zipper", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 📥 PDF Downloader & Zipper

    Enter a URL to download all PDF files from that webpage and receive them as a single zip file.

    **Instructions:**
    1. Paste the URL of the webpage containing PDFs
    2. Click "Download & Zip PDFs"
    3. Wait for processing (may take several minutes for many PDFs)
    4. Download your zip file!
    """)

    with gr.Row():
        url_input = gr.Textbox(
            label="Enter Webpage URL",
            placeholder="https://www.example.com/documents/",
            lines=1,
            scale=4
        )

    with gr.Row():
        submit_btn = gr.Button("📥 Download & Zip PDFs", variant="primary", scale=1)
        clear_btn = gr.Button("🔄 Clear", scale=1)

    status_output = gr.Markdown(label="Status")

    download_output = gr.File(label="Download Zip File")

    # Examples
    gr.Markdown("### Example URLs:")
    gr.Examples(
        examples=[
            ["https://www.esd.whs.mil/Directives/issuances/dodi/"],
        ],
        inputs=url_input
    )

    # Event handlers
    submit_btn.click(
        fn=download_and_zip_pdfs,
        inputs=[url_input],
        outputs=[download_output, status_output]
    )

    clear_btn.click(
        fn=lambda: (None, "", ""),
        inputs=None,
        outputs=[download_output, url_input, status_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
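
The scraper can also be exercised without launching the UI. Below is a minimal smoke-test sketch, under a few assumptions not shown in the listing above: app.py sits in the working directory, its dependencies (gradio, requests, beautifulsoup4) are installed, and the URL is a hypothetical placeholder rather than a page known to host PDFs. The no-op progress stand-in accepts the same calls as gr.Progress() so the function can run outside a Gradio event.

# Hypothetical smoke test for download_and_zip_pdfs, run outside the Gradio UI.
from app import download_and_zip_pdfs

def no_progress(*args, **kwargs):
    """Accepts the same calls as gr.Progress() but does nothing."""
    pass

# Placeholder URL for illustration; substitute a real page that links PDFs.
zip_path, status = download_and_zip_pdfs(
    "https://www.example.com/documents/", progress=no_progress
)
print(status)
if zip_path:
    print(f"Zip written to {zip_path}")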