import os
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.
    Focus: Indian Gazettes, Environmental Reports, and Global Satellite Metadata.

    Crawls an index page, collects anchor links matching a file extension,
    and downloads each document into a local storage directory.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page URL to crawl for document links.
            storage_path: Directory for downloaded files; created if missing.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf", delay=1.0):
        """Crawls the index page and downloads relevant files for training.

        Args:
            file_ext: Only links ending with this extension are downloaded.
            delay: Seconds to sleep between downloads (politeness delay;
                default preserves the original 1-second pause).
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on 4xx/5xx instead of parsing an error page as HTML.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Identify links to documents (Gazettes/Reports)
            for link in soup.find_all('a', href=True):
                href = link['href']
                if not href.endswith(file_ext):
                    continue
                download_url = urljoin(self.base_url, href)
                file_name = href.split('/')[-1]
                self._save_file(download_url, file_name)
                # Respectful delay to prevent server overload
                time.sleep(delay)
        except Exception as e:
            # Top-level boundary: a crawl failure is reported, not raised,
            # so callers keep the original never-raises contract.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Saves the raw data to the sovereign vault (idempotent per name)."""
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested
        print(f"Ingesting: {name}")
        tmp_path = path + ".part"
        try:
            # Stream with a timeout; the context manager guarantees the
            # connection is released even on error (original leaked it).
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            # Atomic rename: a crash mid-download can no longer leave a
            # truncated file that the exists() guard would skip forever.
            os.replace(tmp_path, path)
        except (requests.RequestException, OSError) as e:
            # One bad file should not abort the whole crawl.
            print(f"Failed to ingest {name}: {e}")
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
if __name__ == "__main__":
    # Example Target: The Gazette of India (Archive Node).
    # Note: point this at specific Environmental Impact Assessment URLs
    # for a more focused ingestion run.
    gazette_index = "https://egazette.gov.in/"
    ingestor = SovereignScraper(gazette_index)
    ingestor.crawl_and_download()