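"""Two-level PDF downloader.

Gradio app that scans a main listing page for detail-page links, visits each
detail page to collect PDF links, downloads the PDFs, and packages them into a
single ZIP archive for download.
"""
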
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import zipfile
import tempfile
import shutil
import gradio as gr


def extract_detail_page_links(url, headers):
    """
    Extract all detail page links from the main listing page.

    Args:
        url: Main page URL
        headers: Request headers

    Returns:
        list of detail page URLs
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    detail_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Look for detail page patterns (adjust pattern as needed)
        if 'Details.aspx' in href or 'PUB_ID=' in href:
            full_url = urljoin(url, href)
            if full_url not in detail_links:
                detail_links.append(full_url)

    return detail_links
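

# A matching href typically looks like a relative "Details.aspx?PUB_ID=..." link,
# which urljoin() resolves against the listing URL into an absolute detail-page URL.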


def extract_pdf_links_from_page(url, headers):
    """
    Extract PDF links from a single page.

    Args:
        url: Page URL to scrape
        headers: Request headers

    Returns:
        list of PDF URLs
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '.pdf' in href.lower():
                full_url = urljoin(url, href)
                if full_url not in pdf_links:
                    pdf_links.append(full_url)

        return pdf_links
    except Exception as e:
        print(f"Error extracting PDFs from {url}: {str(e)}")
        return []
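

# Note: failures here are logged and an empty list is returned, so one unreachable
# detail page does not abort the crawl in download_pdfs_from_page().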


def download_pdfs_from_page(url, progress=gr.Progress()):
    """
    Download all PDFs from a webpage by navigating through detail pages.

    Args:
        url: The main webpage URL to scrape
        progress: Gradio progress tracker

    Returns:
        tuple of (zip_file_path, summary_message)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        # Step 1: Extract detail page links from main page
        progress(0, desc="Fetching main page and extracting detail links...")
        detail_page_links = extract_detail_page_links(url, headers)

        if len(detail_page_links) == 0:
            return None, "❌ No detail page links found on the main page."

        progress(0.1, desc=f"Found {len(detail_page_links)} detail pages to process")

        # Step 2: Visit each detail page and collect PDF links
        all_pdf_links = []
        for idx, detail_url in enumerate(detail_page_links, 1):
            progress(0.1 + (0.3 * idx / len(detail_page_links)),
                     desc=f"[{idx}/{len(detail_page_links)}] Scanning detail page...")
            pdf_links = extract_pdf_links_from_page(detail_url, headers)
            all_pdf_links.extend(pdf_links)
            # Be polite - small delay between page requests
            time.sleep(0.5)

        # Remove duplicates
        all_pdf_links = list(set(all_pdf_links))

        if len(all_pdf_links) == 0:
            return None, f"❌ No PDF links found across {len(detail_page_links)} detail pages."

        progress(0.4, desc=f"Found {len(all_pdf_links)} unique PDFs to download")

        # Step 3: Create temporary directory for downloads
        temp_dir = tempfile.mkdtemp()

        # Step 4: Download each PDF
        successful = 0
        failed = 0
        failed_urls = []

        for idx, pdf_url in enumerate(all_pdf_links, 1):
            try:
                parsed_url = urlparse(pdf_url)
                path_without_query = parsed_url.path
                filename = os.path.basename(path_without_query)

                # Handle empty filenames
                if not filename:
                    filename = f"document_{idx}.pdf"

                filepath = os.path.join(temp_dir, filename)

                # Skip if file already exists
                if os.path.exists(filepath):
                    progress(0.4 + (0.5 * idx / len(all_pdf_links)),
                             desc=f"[{idx}/{len(all_pdf_links)}] Skipping: {filename}")
                    successful += 1
                    continue

                progress(0.4 + (0.5 * idx / len(all_pdf_links)),
                         desc=f"[{idx}/{len(all_pdf_links)}] Downloading: {filename}")

                # Download PDF
                pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                pdf_response.raise_for_status()

                # Verify it's actually a PDF (ignore any "; charset=..." parameter)
                content_type = pdf_response.headers.get('content-type', '').split(';')[0].strip().lower()
                if content_type not in ['application/pdf', 'application/octet-stream']:
                    failed += 1
                    failed_urls.append(f"{filename}: Not a valid PDF file")
                    continue

                # Save PDF
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                successful += 1
                time.sleep(1)  # Be polite

            except Exception as e:
                failed += 1
                failed_urls.append(f"{pdf_url}: {str(e)}")
                continue

        # Step 5: Generate summary
        summary = f"""
✅ **Download Complete!**

📊 **Summary:**
- Detail pages scanned: {len(detail_page_links)}
- Total PDFs found: {len(all_pdf_links)}
- Successfully downloaded: {successful}
- Failed: {failed}
"""

        if failed > 0:
            summary += "\n\n⚠️ **Failed Downloads:**\n"
            for fail in failed_urls[:10]:
                summary += f"- {fail}\n"
            if len(failed_urls) > 10:
                summary += f"- ... and {len(failed_urls) - 10} more\n"

        # Step 6: Create zip file
        progress(0.9, desc="Creating zip file...")
        zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, arcname=file)

        # Clean up
        shutil.rmtree(temp_dir)

        progress(1.0, desc="Complete!")
        return zip_path, summary

    except requests.exceptions.RequestException as e:
        return None, f"❌ Error fetching webpage: {str(e)}"
    except Exception as e:
        return None, f"❌ Unexpected error: {str(e)}"
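

# Progress budget used above (approximate): 0.0-0.1 fetch the main page, 0.1-0.4 scan
# detail pages, 0.4-0.9 download PDFs, 0.9-1.0 build the ZIP archive.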


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 📥 Two-Level PDF Downloader

            Download all PDFs from webpages with intermediate detail pages!

            **Instructions:**
            1. Enter the URL of the main listing page
            2. Click "Download PDFs"
            3. The tool will navigate through all detail pages
            4. Download your ZIP file with all PDFs
            """
        )

        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(
                    label="Main Page URL",
                    placeholder="https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",
                    lines=1
                )
                download_btn = gr.Button("📥 Download PDFs", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                output_file = gr.File(label="Download ZIP")
                summary_output = gr.Markdown(label="Summary")

        download_btn.click(
            fn=download_pdfs_from_page,
            inputs=[url_input],
            outputs=[output_file, summary_output]
        )

        gr.Markdown(
            """
            ---
            ### 💡 Features:
            - **Two-level navigation**: Scans main page → visits detail pages → downloads PDFs
            - **Duplicate removal**: Ensures each PDF is downloaded only once
            - **Polite scraping**: Includes delays between requests
            - **Error handling**: Continues even if some downloads fail
            - **Progress tracking**: Real-time updates on scanning and downloading
            """
        )

    return demo
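

# Note: share=True below asks Gradio to open a temporary public *.gradio.live link in
# addition to the local server; remove the argument to keep the app local-only.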
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)