| | import os |
| | import requests |
| | from bs4 import BeautifulSoup |
| | import time |
| | from urllib.parse import urljoin |
| |
|
class SovereignScraper:
    """
    Ingestion engine for ARAVALLI-1.
    Focus: Indian Gazettes, Environmental Reports, and Global Satellite Metadata.
    """

    def __init__(self, base_url: str, storage_path: str = "data/raw/") -> None:
        """Remember the crawl root and make sure the local vault directory exists.

        Args:
            base_url: Index page whose anchor tags are scanned for documents.
            storage_path: Directory downloaded files are written into.
        """
        self.base_url = base_url
        self.storage_path = storage_path
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(self.storage_path, exist_ok=True)

    def crawl_and_download(self, file_ext: str = ".pdf") -> None:
        """Crawls the index page and downloads relevant files for training.

        Args:
            file_ext: Only anchors whose href ends with this suffix are fetched.
        """
        print(f"Initiating Sovereign Ingestion from: {self.base_url}")
        try:
            response = requests.get(self.base_url, timeout=15)
            # Fail fast on 4xx/5xx instead of parsing an error page as the index.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                href = link['href']
                if href.endswith(file_ext):
                    # Relative hrefs are resolved against the index URL.
                    download_url = urljoin(self.base_url, href)
                    file_name = href.split('/')[-1]
                    self._save_file(download_url, file_name)
                    # Politeness delay between downloads so we do not hammer
                    # the government server.
                    time.sleep(1)
        except requests.RequestException as e:
            # Best-effort crawl: report network/HTTP failures and stop.
            print(f"Ingestion Breach: {e}")

    def _save_file(self, url: str, name: str) -> None:
        """Saves the raw data to the sovereign vault.

        Skips files already present on disk (idempotent re-runs).
        """
        path = os.path.join(self.storage_path, name)
        if os.path.exists(path):
            return

        print(f"Ingesting: {name}")
        # Context manager closes the streamed connection; timeout prevents
        # a hung download from stalling the whole crawl.
        with requests.get(url, stream=True, timeout=15) as r:
            r.raise_for_status()  # do not persist HTML error pages as data
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
| |
|
| | if __name__ == "__main__": |
| | |
| | |
| | scraper = SovereignScraper("https://egazette.gov.in/") |
| | scraper.crawl_and_download() |
| |
|