File size: 3,525 Bytes
4851501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
HDX Data Ingestion Script

Downloads and processes humanitarian datasets from the Humanitarian Data Exchange (HDX)
for Panama, including population, health facilities, and other indicators.
"""

import asyncio
import json
import os
from pathlib import Path
from typing import Optional

import httpx

# Base URL of the HDX API; endpoint paths such as "action/package_show"
# are appended to it.
HDX_API = "https://data.humdata.org/api/3"

# Datasets to download: local name (also used as the sub-directory name
# under RAW_DIR) -> HDX dataset ID passed to the API.
DATASETS = dict(
    population_worldpop="worldpop-population-counts-for-panama",
    admin_boundaries="cod-ab-pan",
    health_facilities="panama-healthsites",
)

# "data" lives three directory levels above this file.
DATA_DIR = Path(__file__).parent.parent.parent.joinpath("data")
# Untouched downloads from HDX.
RAW_DIR = DATA_DIR.joinpath("raw", "hdx")
# Output location for derived datasets.
PROCESSED_DIR = DATA_DIR.joinpath("processed")

def ensure_dirs():
    """Make sure the raw and processed data directory trees are present.

    Creates ``RAW_DIR`` and ``PROCESSED_DIR`` (including any missing parent
    directories) and then the ``demographics``, ``health`` and
    ``infrastructure`` folders under ``PROCESSED_DIR``. Directories that
    already exist are left as they are.
    """
    for top_level in (RAW_DIR, PROCESSED_DIR):
        top_level.mkdir(parents=True, exist_ok=True)

    for category in ("demographics", "health", "infrastructure"):
        (PROCESSED_DIR / category).mkdir(exist_ok=True)

async def get_dataset_resources(client: httpx.AsyncClient, dataset_id: str) -> list:
    """Fetch the resource entries that HDX lists for one dataset.

    Calls the ``package_show`` action with ``dataset_id`` and returns the
    ``resources`` list from the ``result`` object of the reply.

    Returns an empty list when the reply does not report success, when the
    result has no ``resources`` key, or when anything goes wrong while
    requesting or decoding the reply (the error is printed, not raised).
    """
    endpoint = f"{HDX_API}/action/package_show"
    try:
        reply = await client.get(endpoint, params={"id": dataset_id})
        reply.raise_for_status()
        payload = reply.json()

        # Payload inspection stays inside the try block so that an
        # unexpected reply shape is reported the same way as a failed request.
        if not payload.get("success"):
            return []
        return payload["result"].get("resources", [])
    except Exception as err:
        print(f"Error fetching dataset {dataset_id}: {err}")
        return []

async def download_resource(client: httpx.AsyncClient, resource: dict, output_dir: Path) -> Optional[str]:
    """Download a single resource file into ``output_dir``.

    Args:
        client: HTTP client used to fetch the resource.
        resource: Resource metadata dict; its ``url``, ``name`` and
            ``format`` keys are read.
        output_dir: Directory the file is written to. It must already exist.

    Returns:
        The path of the downloaded (or previously downloaded) file as a
        string, or ``None`` when the resource is skipped (unsupported format,
        missing URL) or the download fails. Failures are printed, not raised.
    """
    url = resource.get("url")
    # ``or`` also covers keys that are present but null, which would
    # otherwise crash on ``.lower()`` or produce a file named "None.<ext>".
    name = resource.get("name") or "unknown"
    fmt = (resource.get("format") or "").lower()

    # Skip non-data formats
    if fmt not in {"csv", "json", "geojson", "xlsx", "xls", "zip"}:
        return None

    # The name comes from the remote API: neutralise the platform's path
    # separators so the file always lands directly inside output_dir.
    safe_name = str(name)
    for separator in (os.sep, os.altsep):
        if separator:
            safe_name = safe_name.replace(separator, "_")

    filename = f"{safe_name}.{fmt}"
    filepath = output_dir / filename

    # Skip if already downloaded
    if filepath.exists():
        print(f"  Skipping (exists): {filename}")
        return str(filepath)

    if not url:
        print(f"  Error downloading {name}: resource has no URL")
        return None

    print(f"  Downloading: {filename}")
    partial_path = output_dir / f"{filename}.part"
    try:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()

        # Write under a temporary name and rename on success, so a write that
        # fails midway never leaves a truncated file that later runs would
        # treat as already downloaded.
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, filepath)

        return str(filepath)
    except Exception as e:
        print(f"  Error downloading {name}: {e}")
        # Best-effort cleanup of the temporary file; it may not exist.
        try:
            partial_path.unlink()
        except OSError:
            pass
        return None

async def ingest_hdx_datasets():
    """Download every configured HDX dataset into the raw data directory.

    Prepares the directory layout, then walks ``DATASETS`` in order: for each
    entry it creates ``RAW_DIR/<name>``, looks up the dataset's resources and
    downloads them one after another through a single shared HTTP client
    (60 second timeout). Progress is reported on stdout.
    """
    banner = "=" * 60

    ensure_dirs()

    print(banner)
    print("HDX Data Ingestion for Panama")
    print(banner)

    async with httpx.AsyncClient(timeout=60.0) as http:
        for label, hdx_id in DATASETS.items():
            print(f"\n📦 Dataset: {label} ({hdx_id})")

            # Each dataset gets its own folder under the raw HDX directory.
            target_dir = RAW_DIR / label
            target_dir.mkdir(exist_ok=True)

            found = await get_dataset_resources(http, hdx_id)
            print(f"  Found {len(found)} resources")

            # Resources are fetched sequentially, in the order HDX lists them.
            for entry in found:
                await download_resource(http, entry, target_dir)

    print(f"\n{banner}")
    print("Ingestion complete!")
    print(banner)

# Script entry point: run the full ingestion coroutine to completion when this
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(ingest_hdx_datasets())