import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.
    Focus: Indian Gazettes, Environmental Reports, and Global Satellite Metadata.

    Crawls a single index page, finds document links matching a file
    extension, and downloads each into a local storage directory.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page to crawl for document links.
            storage_path: Directory where downloaded files are stored;
                created if it does not exist.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok avoids the check-then-create race of the LBYL form.
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf"):
        """Crawls the index page and downloads relevant files for training.

        Args:
            file_ext: Extension to match (compared case-insensitively, so
                ``.PDF`` links are also ingested).
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on HTTP errors instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Identify links to documents (Gazettes/Reports)
            wanted = file_ext.lower()
            for link in soup.find_all('a', href=True):
                href = link['href']
                if not href.lower().endswith(wanted):
                    continue
                download_url = urljoin(self.base_url, href)
                file_name = href.split('/')[-1]
                if not file_name:
                    # Defensive: an href ending in '/' yields no basename.
                    continue
                self._save_file(download_url, file_name)
                # Respectful delay to prevent server overload
                time.sleep(1)
        except Exception as e:
            # Top-level boundary for the crawl; report and continue.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Saves the raw data to the sovereign vault.

        Streams the response to a temporary ``.part`` file and renames it
        atomically on success, so an interrupted download never leaves a
        truncated file that the skip-check would mistake for complete.

        Args:
            url: Absolute URL of the document to download.
            name: Basename to store the document under.
        """
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested
        print(f"Ingesting: {name}")
        tmp_path = path + ".part"
        # Context manager closes the streaming connection; timeout prevents
        # a stalled server from hanging the scraper indefinitely.
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()  # Don't persist an HTTP error body as data.
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, path)  # Atomic publish of the completed file.


if __name__ == "__main__":
    # Example Target: The Gazette of India (Archive Node)
    # Note: Use specific URLs for Environmental Impact Assessments
    scraper = SovereignScraper("https://egazette.gov.in/")
    scraper.crawl_and_download()