# GeoQuery/backend/scripts/ingest_hdx.py
"""
HDX Data Ingestion Script
Downloads and processes humanitarian datasets from the Humanitarian Data Exchange (HDX)
for Panama, including population, health facilities, and other indicators.
"""
import asyncio
from pathlib import Path
from typing import Optional

import httpx
# HDX API Base URL
HDX_API = "https://data.humdata.org/api/3"
# Datasets to download (name -> HDX dataset ID)
DATASETS = {
"population_worldpop": "worldpop-population-counts-for-panama",
"admin_boundaries": "cod-ab-pan",
"health_facilities": "panama-healthsites",
}
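# The values above are HDX dataset name slugs, i.e. the <name> part of
# https://data.humdata.org/dataset/<name> on the HDX site.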
DATA_DIR = Path(__file__).parent.parent.parent / "data"
RAW_DIR = DATA_DIR / "raw" / "hdx"
PROCESSED_DIR = DATA_DIR / "processed"
def ensure_dirs():
"""Create data directories if they don't exist."""
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
(PROCESSED_DIR / "demographics").mkdir(exist_ok=True)
(PROCESSED_DIR / "health").mkdir(exist_ok=True)
(PROCESSED_DIR / "infrastructure").mkdir(exist_ok=True)
async def get_dataset_resources(client: httpx.AsyncClient, dataset_id: str) -> list:
"""Get list of downloadable resources for a dataset."""
try:
response = await client.get(f"{HDX_API}/action/package_show", params={"id": dataset_id})
response.raise_for_status()
data = response.json()
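        # CKAN wraps responses as {"success": bool, "result": {...}}; each entry in
        # result["resources"] carries the "url", "name", and "format" fields used later.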
if data.get("success"):
return data["result"].get("resources", [])
return []
except Exception as e:
print(f"Error fetching dataset {dataset_id}: {e}")
return []
async def download_resource(client: httpx.AsyncClient, resource: dict, output_dir: Path) -> Optional[str]:
    """Download a single resource file. Returns the local path, or None if skipped or failed."""
    url = resource.get("url")
    name = resource.get("name", "unknown")
    fmt = resource.get("format", "").lower()
    # Skip resources without a download URL or in non-data formats
    if not url or fmt not in ["csv", "json", "geojson", "xlsx", "xls", "zip"]:
        return None
    # Guard against path separators in resource names when building the filename
    safe_name = name.replace("/", "_").replace("\\", "_")
    filename = f"{safe_name}.{fmt}"
    filepath = output_dir / filename
# Skip if already downloaded
if filepath.exists():
print(f" Skipping (exists): {filename}")
return str(filepath)
print(f" Downloading: {filename}")
try:
response = await client.get(url, follow_redirects=True)
response.raise_for_status()
with open(filepath, "wb") as f:
f.write(response.content)
return str(filepath)
except Exception as e:
print(f" Error downloading {name}: {e}")
return None
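# Optional variant (a sketch, not wired into the flow above): for large resources
# such as zipped rasters, streaming the response to disk avoids buffering the whole
# file in memory. It uses only httpx's documented streaming API; the function name
# is an illustrative addition, not part of the original script.
async def download_resource_streamed(client: httpx.AsyncClient, url: str, filepath: Path) -> None:
    """Stream a resource to disk in chunks instead of loading it fully into memory."""
    async with client.stream("GET", url, follow_redirects=True) as response:
        response.raise_for_status()
        with open(filepath, "wb") as f:
            # aiter_bytes() yields the response body incrementally
            async for chunk in response.aiter_bytes():
                f.write(chunk)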
async def ingest_hdx_datasets():
"""Main ingestion function."""
ensure_dirs()
print("=" * 60)
print("HDX Data Ingestion for Panama")
print("=" * 60)
async with httpx.AsyncClient(timeout=60.0) as client:
for name, dataset_id in DATASETS.items():
print(f"\n📦 Dataset: {name} ({dataset_id})")
# Create dataset-specific directory
dataset_dir = RAW_DIR / name
dataset_dir.mkdir(exist_ok=True)
# Get resources
resources = await get_dataset_resources(client, dataset_id)
print(f" Found {len(resources)} resources")
# Download each resource
for resource in resources:
await download_resource(client, resource, dataset_dir)
print("\n" + "=" * 60)
print("Ingestion complete!")
print("=" * 60)
if __name__ == "__main__":
asyncio.run(ingest_hdx_datasets())
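# Typical invocation, assuming the backend/scripts/ layout shown above and a run
# from the repository root:
#   python backend/scripts/ingest_hdx.py
# Raw downloads land in <repo_root>/data/raw/hdx/<dataset_name>/.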