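"""Two-level PDF downloader.

Gradio app that scans a main listing page for detail-page links, visits each
detail page to collect PDF links, downloads the PDFs, and packages them into a
single ZIP archive for download.
"""
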
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import zipfile
import tempfile
import shutil
import gradio as gr


def extract_detail_page_links(url, headers):
    """
    Extract all detail page links from the main listing page.

    Args:
        url: Main page URL
        headers: Request headers

    Returns:
        list of detail page URLs
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    detail_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Look for detail page patterns (adjust pattern as needed)
        if 'Details.aspx' in href or 'PUB_ID=' in href:
            full_url = urljoin(url, href)
            if full_url not in detail_links:
                detail_links.append(full_url)

    return detail_links
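

# A matching href typically looks like a relative "Details.aspx?PUB_ID=..." link,
# which urljoin() resolves against the listing URL into an absolute detail-page URL.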


def extract_pdf_links_from_page(url, headers):
    """
    Extract PDF links from a single page.

    Args:
        url: Page URL to scrape
        headers: Request headers

    Returns:
        list of PDF URLs
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '.pdf' in href.lower():
                full_url = urljoin(url, href)
                if full_url not in pdf_links:
                    pdf_links.append(full_url)

        return pdf_links
    except Exception as e:
        print(f"Error extracting PDFs from {url}: {str(e)}")
        return []
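

# Note: failures here are logged and an empty list is returned, so one unreachable
# detail page does not abort the crawl in download_pdfs_from_page().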


def download_pdfs_from_page(url, progress=gr.Progress()):
    """
    Download all PDFs from a webpage by navigating through detail pages.

    Args:
        url: The main webpage URL to scrape
        progress: Gradio progress tracker

    Returns:
        tuple of (zip_file_path, summary_message)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        # Step 1: Extract detail page links from main page
        progress(0, desc="Fetching main page and extracting detail links...")
        detail_page_links = extract_detail_page_links(url, headers)

        if len(detail_page_links) == 0:
            return None, "❌ No detail page links found on the main page."

        progress(0.1, desc=f"Found {len(detail_page_links)} detail pages to process")

        # Step 2: Visit each detail page and collect PDF links
        all_pdf_links = []
        for idx, detail_url in enumerate(detail_page_links, 1):
            progress(0.1 + (0.3 * idx / len(detail_page_links)),
                     desc=f"[{idx}/{len(detail_page_links)}] Scanning detail page...")
            pdf_links = extract_pdf_links_from_page(detail_url, headers)
            all_pdf_links.extend(pdf_links)
            # Be polite - small delay between page requests
            time.sleep(0.5)

        # Remove duplicates
        all_pdf_links = list(set(all_pdf_links))

        if len(all_pdf_links) == 0:
            return None, f"❌ No PDF links found across {len(detail_page_links)} detail pages."

        progress(0.4, desc=f"Found {len(all_pdf_links)} unique PDFs to download")

        # Step 3: Create temporary directory for downloads
        temp_dir = tempfile.mkdtemp()

        # Step 4: Download each PDF
        successful = 0
        failed = 0
        failed_urls = []

        for idx, pdf_url in enumerate(all_pdf_links, 1):
            try:
                parsed_url = urlparse(pdf_url)
                path_without_query = parsed_url.path
                filename = os.path.basename(path_without_query)

                # Handle empty filenames
                if not filename:
                    filename = f"document_{idx}.pdf"

                filepath = os.path.join(temp_dir, filename)

                # Skip if file already exists
                if os.path.exists(filepath):
                    progress(0.4 + (0.5 * idx / len(all_pdf_links)),
                             desc=f"[{idx}/{len(all_pdf_links)}] Skipping: {filename}")
                    successful += 1
                    continue

                progress(0.4 + (0.5 * idx / len(all_pdf_links)),
                         desc=f"[{idx}/{len(all_pdf_links)}] Downloading: {filename}")

                # Download PDF
                pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                pdf_response.raise_for_status()

                # Verify it's actually a PDF (ignore any "; charset=..." parameter)
                content_type = pdf_response.headers.get('content-type', '').split(';')[0].strip().lower()
                if content_type not in ['application/pdf', 'application/octet-stream']:
                    failed += 1
                    failed_urls.append(f"{filename}: Not a valid PDF file")
                    continue

                # Save PDF
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                successful += 1
                time.sleep(1)  # Be polite

            except Exception as e:
                failed += 1
                failed_urls.append(f"{pdf_url}: {str(e)}")
                continue

        # Step 5: Generate summary
        summary = f"""
✅ **Download Complete!**

📊 **Summary:**
- Detail pages scanned: {len(detail_page_links)}
- Total PDFs found: {len(all_pdf_links)}
- Successfully downloaded: {successful}
- Failed: {failed}
"""

        if failed > 0:
            summary += "\n\n⚠️ **Failed Downloads:**\n"
            for fail in failed_urls[:10]:
                summary += f"- {fail}\n"
            if len(failed_urls) > 10:
                summary += f"- ... and {len(failed_urls) - 10} more\n"

        # Step 6: Create zip file
        progress(0.9, desc="Creating zip file...")
        zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, arcname=file)

        # Clean up
        shutil.rmtree(temp_dir)

        progress(1.0, desc="Complete!")
        return zip_path, summary

    except requests.exceptions.RequestException as e:
        return None, f"❌ Error fetching webpage: {str(e)}"
    except Exception as e:
        return None, f"❌ Unexpected error: {str(e)}"
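

# Progress budget used above (approximate): 0.0-0.1 fetch the main page, 0.1-0.4 scan
# detail pages, 0.4-0.9 download PDFs, 0.9-1.0 build the ZIP archive.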


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 📥 Two-Level PDF Downloader

            Download all PDFs from webpages with intermediate detail pages!

            **Instructions:**
            1. Enter the URL of the main listing page
            2. Click "Download PDFs"
            3. The tool will navigate through all detail pages
            4. Download your ZIP file with all PDFs
            """
        )

        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(
                    label="Main Page URL",
                    placeholder="https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",
                    lines=1
                )
                download_btn = gr.Button("📥 Download PDFs", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                output_file = gr.File(label="Download ZIP")
                summary_output = gr.Markdown(label="Summary")

        download_btn.click(
            fn=download_pdfs_from_page,
            inputs=[url_input],
            outputs=[output_file, summary_output]
        )

        gr.Markdown(
            """
            ---
            ### 💡 Features:
            - **Two-level navigation**: Scans main page → visits detail pages → downloads PDFs
            - **Duplicate removal**: Ensures each PDF is downloaded only once
            - **Polite scraping**: Includes delays between requests
            - **Error handling**: Continues even if some downloads fail
            - **Progress tracking**: Real-time updates on scanning and downloading
            """
        )

    return demo
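

# Note: share=True below asks Gradio to open a temporary public *.gradio.live link in
# addition to the local server; remove the argument to keep the app local-only.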
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)