| | import os |
| | import requests |
| | from bs4 import BeautifulSoup |
| | import time |
| | from urllib.parse import urljoin |
| |
|
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.
    Focus: Indian Gazettes, Environmental Reports, and Global Satellite Metadata.
    """

    def __init__(self, base_url: str, storage_path: str = "data/raw/") -> None:
        """Remember the crawl root and make sure the local vault directory exists.

        Args:
            base_url: Index page whose anchor tags are scanned for documents.
            storage_path: Directory downloaded files are written into.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext: str = ".pdf") -> None:
        """Crawls the index page and downloads relevant files for training.

        Args:
            file_ext: Only anchors whose href ends with this suffix are fetched.
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on 4xx/5xx instead of parsing an error page as the index.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.endswith(file_ext):
                    # Relative hrefs are resolved against the index URL.
                    download_url = urljoin(self.base_url, href)
                    file_name = href.split('/')[-1]
                    self._save_file(download_url, file_name)
                    # Politeness delay between downloads so we do not hammer
                    # the government server.
                    time.sleep(1)
        except requests.RequestException as e:
            # Best-effort crawl: report network/HTTP failures and stop.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url: str, name: str) -> None:
        """Saves the raw data to the sovereign vault.

        Skips files already present on disk (idempotent re-runs).
        """
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return

        print(f"Ingesting: {name}")
        # Context manager closes the streamed connection; timeout prevents
        # a hung download from stalling the whole crawl.
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()  # do not persist HTML error pages as data
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
| |
|
| | if __name__ == "__main__": |
| | |
| | |
| | scraper = SovereignScraper("https://egazette.gov.in/") |
| | scraper.crawl_and_download() |
| |
|