import os
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.
    Focus: Indian Gazettes, Environmental Reports, and Global Satellite Metadata.

    Crawls an index page, collects anchor links matching a file extension,
    and downloads each document into a local storage directory.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page URL to crawl for document links.
            storage_path: Directory for downloaded files; created if missing.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf", delay=1.0):
        """Crawls the index page and downloads relevant files for training.

        Args:
            file_ext: Only links ending with this extension are downloaded.
            delay: Seconds to sleep between downloads (politeness delay;
                default preserves the original 1-second pause).
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on 4xx/5xx instead of parsing an error page as HTML.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Identify links to documents (Gazettes/Reports)
            for link in soup.find_all('a', href=True):
                href = link['href']
                if not href.endswith(file_ext):
                    continue
                download_url = urljoin(self.base_url, href)
                file_name = href.split('/')[-1]
                self._save_file(download_url, file_name)
                # Respectful delay to prevent server overload
                time.sleep(delay)
        except Exception as e:
            # Top-level boundary: a crawl failure is reported, not raised,
            # so callers keep the original never-raises contract.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Saves the raw data to the sovereign vault (idempotent per name)."""
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested
        print(f"Ingesting: {name}")
        tmp_path = path + ".part"
        try:
            # Stream with a timeout; the context manager guarantees the
            # connection is released even on error (original leaked it).
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Atomic rename: a crash mid-download can no longer leave a
            # truncated file that the exists() guard would skip forever.
            os.replace(tmp_path, path)
        except (requests.RequestException, OSError) as e:
            # One bad file should not abort the whole crawl.
            print(f"Failed to ingest {name}: {e}")
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
if __name__ == "__main__":
    # Example Target: The Gazette of India (Archive Node).
    # Note: point this at specific Environmental Impact Assessment URLs
    # for a more focused ingestion run.
    gazette_index = "https://egazette.gov.in/"
    ingestor = SovereignScraper(gazette_index)
    ingestor.crawl_and_download()