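"""
Two-Level PDF Downloader (Gradio app).

Scrapes a main listing page for detail-page links, visits each detail page to
collect PDF links, downloads the PDFs with polite delays, and packages them
into a ZIP file served through a Gradio web interface.
"""
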
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import zipfile
import tempfile
import shutil
import gradio as gr

def extract_detail_page_links(url, headers):
    """
    Extract all detail page links from the main listing page.
    
    Args:
        url: Main page URL
        headers: Request headers
    
    Returns:
        list of detail page URLs
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    
    detail_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Look for detail page patterns (adjust pattern as needed)
        if 'Details.aspx' in href or 'PUB_ID=' in href:
            full_url = urljoin(url, href)
            if full_url not in detail_links:
                detail_links.append(full_url)
    
    return detail_links
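
# Examples of hrefs the pattern above matches (hypothetical paths, for illustration only):
#   "Details.aspx?PUB_ID=12345"           -> collected (contains "Details.aspx")
#   "/some/other/page.aspx?PUB_ID=67890"  -> collected (contains "PUB_ID=")
#   "/ProductMaps/PubForm/AR.aspx"        -> ignored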

def extract_pdf_links_from_page(url, headers):
    """
    Extract PDF links from a single page.
    
    Args:
        url: Page URL to scrape
        headers: Request headers
    
    Returns:
        list of PDF URLs
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '.pdf' in href.lower():
                full_url = urljoin(url, href)
                if full_url not in pdf_links:
                    pdf_links.append(full_url)
        
        return pdf_links
    except Exception as e:
        print(f"Error extracting PDFs from {url}: {str(e)}")
        return []

def download_pdfs_from_page(url, progress=gr.Progress()):
    """
    Download all PDFs from a webpage by navigating through detail pages.
    
    Args:
        url: The main webpage URL to scrape
        progress: Gradio progress tracker
    
    Returns:
        tuple of (zip_file_path, summary_message)
    """
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    try:
        # Step 1: Extract detail page links from main page
        progress(0, desc="Fetching main page and extracting detail links...")
        detail_page_links = extract_detail_page_links(url, headers)
        
        if len(detail_page_links) == 0:
            return None, "❌ No detail page links found on the main page."
        
        progress(0.1, desc=f"Found {len(detail_page_links)} detail pages to process")
        
        # Step 2: Visit each detail page and collect PDF links
        all_pdf_links = []
        for idx, detail_url in enumerate(detail_page_links, 1):
            progress(0.1 + (0.3 * idx / len(detail_page_links)), 
                    desc=f"[{idx}/{len(detail_page_links)}] Scanning detail page...")
            
            pdf_links = extract_pdf_links_from_page(detail_url, headers)
            all_pdf_links.extend(pdf_links)
            
            # Be polite - small delay between page requests
            time.sleep(0.5)
        
        # Remove duplicates
        all_pdf_links = list(set(all_pdf_links))
        
        if len(all_pdf_links) == 0:
            return None, f"❌ No PDF links found across {len(detail_page_links)} detail pages."
        
        progress(0.4, desc=f"Found {len(all_pdf_links)} unique PDFs to download")
        
        # Step 3: Create temporary directory for downloads
        temp_dir = tempfile.mkdtemp()
        
        # Step 4: Download each PDF
        successful = 0
        failed = 0
        failed_urls = []
        
        for idx, pdf_url in enumerate(all_pdf_links, 1):
            try:
                parsed_url = urlparse(pdf_url)
                path_without_query = parsed_url.path
                filename = os.path.basename(path_without_query)
                
                # Fall back to a generated name when the URL path has no filename
                if not filename:
                    filename = f"document_{idx}.pdf"
                
                filepath = os.path.join(temp_dir, filename)
                
                # Skip duplicate filenames (the temp dir is fresh, so this only
                # triggers when two URLs share the same basename)
                if os.path.exists(filepath):
                    progress(0.4 + (0.5 * idx / len(all_pdf_links)), 
                            desc=f"[{idx}/{len(all_pdf_links)}] Skipping: {filename}")
                    successful += 1
                    continue
                
                progress(0.4 + (0.5 * idx / len(all_pdf_links)), 
                        desc=f"[{idx}/{len(all_pdf_links)}] Downloading: {filename}")
                
                # Download PDF
                pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                pdf_response.raise_for_status()
                
                # Verify it's actually a PDF (the header may carry parameters,
                # e.g. "application/pdf; charset=binary", so match on the prefix)
                content_type = pdf_response.headers.get('content-type', '').lower()
                if not content_type.startswith(('application/pdf', 'application/octet-stream')):
                    failed += 1
                    failed_urls.append(f"{filename}: Not a valid PDF file")
                    continue
                
                # Save PDF
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)
                
                successful += 1
                time.sleep(1)  # Be polite
                
            except Exception as e:
                # Report the URL here: `filename` may not be assigned yet if the
                # failure happened while parsing the URL
                failed += 1
                failed_urls.append(f"{pdf_url}: {str(e)}")
                continue
        
        # Step 5: Generate summary
        summary = f"""
✅ **Download Complete!**

📊 **Summary:**
- Detail pages scanned: {len(detail_page_links)}
- Total PDFs found: {len(all_pdf_links)}
- Successfully downloaded: {successful}
- Failed: {failed}
"""
        
        if failed > 0:
            summary += f"\n\n⚠️ **Failed Downloads:**\n"
            for fail in failed_urls[:10]:
                summary += f"- {fail}\n"
            if len(failed_urls) > 10:
                summary += f"- ... and {len(failed_urls) - 10} more\n"
        
        # Step 6: Create zip file
        progress(0.9, desc="Creating zip file...")
        
        zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, arcname=file)
        
        # Clean up
        shutil.rmtree(temp_dir)
        
        progress(1.0, desc="Complete!")
        return zip_path, summary
        
    except requests.exceptions.RequestException as e:
        return None, f"❌ Error fetching webpage: {str(e)}"
    except Exception as e:
        return None, f"❌ Unexpected error: {str(e)}"

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 📥 Two-Level PDF Downloader
            Download all PDFs from webpages with intermediate detail pages!
            
            **Instructions:**
            1. Enter the URL of the main listing page
            2. Click "Download PDFs"
            3. The tool will navigate through all detail pages
            4. Download your ZIP file with all PDFs
            """
        )
        
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(
                    label="Main Page URL",
                    placeholder="https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",
                    lines=1
                )
                
                download_btn = gr.Button("πŸ“₯ Download PDFs", variant="primary", size="lg")
        
        with gr.Row():
            with gr.Column():
                output_file = gr.File(label="Download ZIP")
                summary_output = gr.Markdown(label="Summary")
        
        download_btn.click(
            fn=download_pdfs_from_page,
            inputs=[url_input],
            outputs=[output_file, summary_output]
        )
        
        gr.Markdown(
            """
            ---
            ### 💡 Features:
            - **Two-level navigation**: Scans main page → visits detail pages → downloads PDFs
            - **Duplicate removal**: Ensures each PDF is downloaded only once
            - **Polite scraping**: Includes delays between requests
            - **Error handling**: Continues even if some downloads fail
            - **Progress tracking**: Real-time updates on scanning and downloading
            """
        )
    
    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
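
# To run locally (assuming this file is saved as app.py; package names are the
# standard PyPI distributions for the imports above):
#   pip install requests beautifulsoup4 gradio
#   python app.py
# launch(share=True) additionally requests a temporary public share link from Gradio.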