Spaces:

GerardCB
/

GeoQuery

Running

File size: 4,719 Bytes
#!/usr/bin/env python3
"""
Process World Bank indicators and create GeoJSON layers
Joins most recent indicator data to Panama administrative boundaries
"""

import pandas as pd
import geopandas as gpd
from pathlib import Path
import logging
import json

# Logging is configured at import time; every message in this script goes
# through the module-level logger defined here.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# All paths are resolved relative to this file: <this file's parent dir>/../data
DATA_DIR = Path(__file__).parent.parent / "data"
WB_DIR = DATA_DIR / "worldbank"  # input: panama_indicators.csv
BASE_DIR = DATA_DIR / "base"  # input: pan_admin1.geojson
OUTPUT_DIR = DATA_DIR / "socioeconomic"  # output: generated GeoJSON file

def load_admin_boundaries():
    """Read the Panama boundary layer stored in the base data directory.

    Returns:
        The GeoDataFrame read from ``pan_admin1.geojson``, or ``None`` (after
        logging an error) when that file does not exist.
    """
    boundaries_file = BASE_DIR / "pan_admin1.geojson"

    if boundaries_file.exists():
        provinces = gpd.read_file(boundaries_file)
        logger.info(f"Loaded {len(provinces)} provinces")
        return provinces

    # Missing input is reported to the caller as None rather than an exception.
    logger.error(f"Admin boundaries not found: {boundaries_file}")
    return None

def process_indicators():
    """Load the World Bank indicator CSV and keep the latest usable row per indicator.

    The CSV is expected to provide at least the ``indicator_code``, ``year``
    and ``value`` columns. Rows without a year or without a value are ignored
    when choosing the most recent record, so an indicator whose newest year
    has not been published yet falls back to its latest year that actually
    carries data.

    Returns:
        A DataFrame with one row per indicator code (the row with the highest
        ``year`` among usable rows), or ``None`` (after logging an error) when
        the CSV file does not exist.
    """
    csv_path = WB_DIR / "panama_indicators.csv"

    if not csv_path.exists():
        logger.error(f"Indicators file not found: {csv_path}")
        return None

    df = pd.read_csv(csv_path)
    logger.info(f"Loaded {len(df)} indicator records")

    # Rows with a missing value would otherwise win the "most recent" pick and
    # propagate NaN downstream; rows with a missing year cannot be ranked and
    # make idxmax fail for groups that contain nothing else.
    usable = df.dropna(subset=['year', 'value'])
    skipped = len(df) - len(usable)
    if skipped:
        logger.info(f"Ignored {skipped} records without a year or value")

    # Get most recent year for each indicator
    latest_df = usable.loc[usable.groupby('indicator_code')['year'].idxmax()]
    logger.info(f"Selected most recent data for {len(latest_df)} indicators")

    return latest_df

def _json_safe(value):
    """Return *value* in a form that ``json.dump`` writes as strict JSON."""
    if pd.isna(value):
        # NaN would be emitted as the bare token ``NaN``, which is not valid JSON.
        return None
    # NumPy scalars (e.g. int64) are not JSON serializable; unwrap to Python types.
    return value.item() if hasattr(value, "item") else value


def create_national_geojson(indicators_df, admin_gdf):
    """Write the latest national indicators as a single-point GeoJSON file.

    Args:
        indicators_df: DataFrame with ``indicator_code``, ``indicator_name``,
            ``year`` and ``value`` columns, one row per indicator.
        admin_gdf: Administrative boundaries. Currently unused: the data is
            national-level, so it is attached to a fixed point instead of a
            boundary geometry. Kept so callers do not need to change.

    Returns:
        Path of the written ``panama_national_indicators.geojson`` file.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Since WB data is national-level, it would ideally be attached to the
    # country boundary (admin0). For now, a single point feature near
    # Panama's center carries all indicators.

    # max() of an empty or all-NaN year column is NaN, which int() rejects.
    latest_year = indicators_df['year'].max()
    properties = {
        'country': 'Panama',
        'data_year': None if pd.isna(latest_year) else int(latest_year)
    }

    # Add each indicator as a property
    for _, row in indicators_df.iterrows():
        # Property key derived from the indicator code: lower-cased, dots -> underscores
        col_name = row['indicator_code'].lower().replace('.', '_')
        properties[col_name] = _json_safe(row['value'])
        properties[f"{col_name}_name"] = _json_safe(row['indicator_name'])

    feature = {
        "type": "Feature",
        "geometry": {
            "type": "Point",
            "coordinates": [-80.0, 8.5]  # Approximate center of Panama
        },
        "properties": properties
    }

    geojson = {
        "type": "FeatureCollection",
        "features": [feature]
    }

    # Save GeoJSON; the encoding is explicit so the result does not depend on
    # the platform's locale.
    output_file = OUTPUT_DIR / "panama_national_indicators.geojson"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(geojson, f, indent=2)

    logger.info(f"Created national indicators GeoJSON: {output_file}")
    logger.info(f"  Indicators included: {len(indicators_df)}")

    return output_file

def update_catalog(geojson_path):
    """Register the national indicators dataset in ``catalog.json``.

    The existing catalog is read, the ``panama_national_indicators`` entry is
    added (or overwritten), and the file is written back.

    Args:
        geojson_path: ``Path`` of the generated GeoJSON file; it must be
            located under ``DATA_DIR`` because the catalog stores it relative
            to that directory.

    Raises:
        FileNotFoundError: If ``catalog.json`` does not exist.
        ValueError: If ``geojson_path`` is not located under ``DATA_DIR``.
    """
    catalog_path = DATA_DIR / "catalog.json"

    # JSON is UTF-8 by specification; do not rely on the platform locale.
    with open(catalog_path, 'r', encoding='utf-8') as f:
        catalog = json.load(f)

    # Add new entry
    catalog["panama_national_indicators"] = {
        # as_posix() keeps forward slashes so the catalog is identical on every OS.
        "path": geojson_path.relative_to(DATA_DIR).as_posix(),
        "description": "National socio-economic indicators from World Bank (2000-2024)",
        "semantic_description": "Comprehensive national-level statistics for Panama including poverty rates, GDP, unemployment, health expenditure, maternal/child mortality, literacy rates, and school enrollment. Data sourced from World Bank Open Data API. Use this dataset for analyzing Panama's socio-economic development trends over time.",
        "tags": [
            "socioeconomic",
            "worldbank",
            "poverty",
            "gdp",
            "employment",
            "health",
            "education",
            "national",
            "panama"
        ],
        "data_type": "static",
        "category": "socioeconomic",
        "format": "geojson"
    }

    # Serialize before reopening the file: opening with 'w' truncates it, so a
    # serialization error at that point would leave an empty or partial catalog.
    serialized = json.dumps(catalog, indent=2)
    with open(catalog_path, 'w', encoding='utf-8') as f:
        f.write(serialized)

    logger.info("Updated catalog.json")

def main():
    """Run the pipeline: load inputs, write the GeoJSON, register it in the catalog.

    Both inputs are loaded before either is checked; if one is unavailable an
    error is logged and the function returns without writing anything.
    """
    logger.info("Processing World Bank indicators...")

    boundaries = load_admin_boundaries()
    latest_indicators = process_indicators()

    inputs_ready = boundaries is not None and latest_indicators is not None
    if not inputs_ready:
        logger.error("Failed to load required data")
        return

    # Write the layer first, then point the catalog at the file just created.
    written_file = create_national_geojson(latest_indicators, boundaries)
    update_catalog(written_file)

    logger.info("Processing complete!")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()