Transformers
English
Hindi
Sanskrit
sovereign-ai
ecological-intelligence
indian-llm
environmental-protection
ARAVALLI-1 / data / scripts / scraper.py
iamkoder001's picture
Create data/scripts/scraper.py
c7c6bc0 verified
import os
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.

    Focus: Indian Gazettes, Environmental Reports, and Global Satellite
    Metadata. Crawls an index page for document links and downloads each
    matching file into a local raw-data directory.
    """

    def __init__(self, base_url, storage_path="data/raw/"):
        """
        Args:
            base_url: Index page URL to crawl for document links.
            storage_path: Directory for downloaded files; created if missing.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext=".pdf"):
        """Crawl the index page and download every link ending in *file_ext*.

        Best-effort by design: network/HTTP failures are reported and
        swallowed so one bad page does not abort a long ingestion run.
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on 4xx/5xx instead of parsing an error page as HTML.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Identify links to documents (Gazettes/Reports)
            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.endswith(file_ext):
                    download_url = urljoin(self.base_url, href)
                    file_name = href.split('/')[-1]
                    self._save_file(download_url, file_name)
                    # Respectful delay to prevent server overload
                    time.sleep(1)
        except requests.RequestException as e:
            # Narrowed from a bare `except Exception`: only network/HTTP
            # errors are expected and tolerable here.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url, name):
        """Download *url* into the storage directory as *name*.

        Skips files already present. Streams to a ``.part`` temp name and
        renames atomically on success, so an interrupted download is never
        mistaken for a complete file on the next run.
        """
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return  # Skip if already ingested
        print(f"Ingesting: {name}")
        tmp_path = path + ".part"
        try:
            # Original had no timeout (a stalled server would hang forever)
            # and never closed the response; `with` handles both.
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(tmp_path, path)  # atomic publish of the finished file
        except (OSError, requests.RequestException) as e:
            # Drop the partial file so the skip-if-exists check above does
            # not treat it as complete on the next run.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            print(f"Ingestion Breach: {e}")
if __name__ == "__main__":
# Example Target: The Gazette of India (Archive Node)
# Note: Use specific URLs for Environmental Impact Assessments
scraper = SovereignScraper("https://egazette.gov.in/")
scraper.crawl_and_download()