GerardCB committed on
Commit c6a9e71 · 1 Parent(s): 4851501

Cleanup: Remove unused download scripts

backend/scripts/download_geofabrik.py DELETED
@@ -1,192 +0,0 @@
- """
- Panama Data Ingestion - Phase A: OpenStreetMap via Geofabrik
-
- Downloads pre-packaged OSM data for Panama as shapefiles and converts to GeoJSON.
- Data source: https://download.geofabrik.de/central-america.html
- """
-
- import os
- import sys
- import zipfile
- import requests
- import subprocess
- from pathlib import Path
-
- # Panama Geofabrik URL
- GEOFABRIK_URL = "https://download.geofabrik.de/central-america/panama-latest-free.shp.zip"
-
- # Output directories
- DATA_DIR = Path(__file__).parent.parent / "data"
- OSM_DIR = DATA_DIR / "osm"
- TEMP_DIR = DATA_DIR / "temp"
-
- # OSM layers to extract
- OSM_LAYERS = [
-     ("gis_osm_roads_free_1", "roads", "Road network with classification"),
-     ("gis_osm_pois_free_1", "pois", "Points of interest (restaurants, shops, etc.)"),
-     ("gis_osm_pois_a_free_1", "pois_areas", "POI areas (larger venues)"),
-     ("gis_osm_buildings_a_free_1", "buildings", "Building footprints"),
-     ("gis_osm_landuse_a_free_1", "landuse", "Land use zones (residential, commercial, etc.)"),
-     ("gis_osm_natural_free_1", "natural_points", "Natural features (trees, peaks)"),
-     ("gis_osm_natural_a_free_1", "natural_areas", "Natural areas (forests, parks)"),
-     ("gis_osm_water_a_free_1", "water_areas", "Water bodies (lakes, reservoirs)"),
-     ("gis_osm_waterways_free_1", "waterways", "Rivers and streams"),
-     ("gis_osm_railways_free_1", "railways", "Railway lines"),
-     ("gis_osm_traffic_free_1", "traffic", "Traffic infrastructure (signals, crossings)"),
-     ("gis_osm_traffic_a_free_1", "traffic_areas", "Traffic areas (parking lots)"),
-     ("gis_osm_transport_free_1", "transport", "Transport points (bus stops, stations)"),
-     ("gis_osm_transport_a_free_1", "transport_areas", "Transport areas (airports, ports)"),
-     ("gis_osm_places_free_1", "places", "Place names (cities, towns, villages)"),
-     ("gis_osm_places_a_free_1", "places_areas", "Place areas"),
-     ("gis_osm_pofw_free_1", "places_of_worship", "Places of worship"),
-     ("gis_osm_pofw_a_free_1", "places_of_worship_areas", "Places of worship (buildings)"),
- ]
-
-
- def download_file(url: str, dest: Path) -> bool:
-     """Download a file with progress indication."""
-     print(f"📥 Downloading {url}...")
-
-     try:
-         response = requests.get(url, stream=True)
-         response.raise_for_status()
-
-         total_size = int(response.headers.get('content-length', 0))
-         downloaded = 0
-
-         with open(dest, 'wb') as f:
-             for chunk in response.iter_content(chunk_size=8192):
-                 f.write(chunk)
-                 downloaded += len(chunk)
-                 if total_size > 0:
-                     pct = (downloaded / total_size) * 100
-                     print(f"\r Progress: {pct:.1f}% ({downloaded // 1024 // 1024}MB)", end="")
-
-         print(f"\n✅ Downloaded to {dest}")
-         return True
-
-     except Exception as e:
-         print(f"❌ Download failed: {e}")
-         return False
-
-
- def convert_shp_to_geojson(shp_path: Path, geojson_path: Path) -> bool:
-     """Convert shapefile to GeoJSON using ogr2ogr."""
-     try:
-         cmd = [
-             "ogr2ogr",
-             "-f", "GeoJSON",
-             "-t_srs", "EPSG:4326",  # Ensure WGS84
-             str(geojson_path),
-             str(shp_path)
-         ]
-         result = subprocess.run(cmd, capture_output=True, text=True)
-
-         if result.returncode == 0:
-             return True
-         else:
-             print(f" ogr2ogr error: {result.stderr}")
-             return False
-
-     except FileNotFoundError:
-         print("⚠️ ogr2ogr not found. Please install GDAL:")
-         print(" brew install gdal # macOS")
-         print(" apt install gdal-bin # Ubuntu")
-         return False
-
-
- def extract_and_convert():
-     """Extract shapefiles from zip and convert to GeoJSON."""
-
-     # Ensure directories exist
-     OSM_DIR.mkdir(parents=True, exist_ok=True)
-     TEMP_DIR.mkdir(parents=True, exist_ok=True)
-
-     zip_path = TEMP_DIR / "panama-osm.zip"
-
-     # Download if not exists
-     if not zip_path.exists():
-         if not download_file(GEOFABRIK_URL, zip_path):
-             return False
-     else:
-         print(f"📦 Using cached {zip_path}")
-
-     # Extract
-     print(f"📂 Extracting to {TEMP_DIR}...")
-     with zipfile.ZipFile(zip_path, 'r') as zf:
-         zf.extractall(TEMP_DIR)
-
-     # Convert each layer
-     converted = 0
-     for shp_name, output_name, description in OSM_LAYERS:
-         shp_path = TEMP_DIR / f"{shp_name}.shp"
-         geojson_path = OSM_DIR / f"{output_name}.geojson"
-
-         if not shp_path.exists():
-             print(f"⏭️ Skipping {shp_name} (not in download)")
-             continue
-
-         print(f"🔄 Converting {shp_name} → {output_name}.geojson...")
-
-         if convert_shp_to_geojson(shp_path, geojson_path):
-             # Get file size
-             size_mb = geojson_path.stat().st_size / 1024 / 1024
-             print(f" ✅ Created {geojson_path.name} ({size_mb:.1f}MB)")
-             converted += 1
-         else:
-             print(f" ❌ Failed to convert {shp_name}")
-
-     print(f"\n🎉 Converted {converted}/{len(OSM_LAYERS)} OSM layers")
-     return converted > 0
-
-
- def register_in_catalog():
-     """Register OSM datasets in the catalog."""
-     import json
-
-     catalog_path = DATA_DIR / "catalog.json"
-
-     if catalog_path.exists():
-         with open(catalog_path) as f:
-             catalog = json.load(f)
-     else:
-         catalog = {}
-
-     for shp_name, output_name, description in OSM_LAYERS:
-         geojson_path = OSM_DIR / f"{output_name}.geojson"
-
-         if not geojson_path.exists():
-             continue
-
-         # Create catalog entry
-         table_name = f"osm_{output_name}"
-         rel_path = f"osm/{output_name}.geojson"
-
-         catalog[table_name] = {
-             "source_file": rel_path,
-             "source_type": "geojson",
-             "description": f"OpenStreetMap {description} for Panama",
-             "tags": ["osm", "panama", output_name.replace("_", " ")],
-             "data_type": "vector",
-             "geometry_type": "auto"  # Will be detected on load
-         }
-
-         print(f"📝 Registered {table_name}")
-
-     with open(catalog_path, 'w') as f:
-         json.dump(catalog, f, indent=2)
-
-     print(f"✅ Updated catalog with OSM datasets")
-
-
- if __name__ == "__main__":
-     print("=" * 60)
-     print("🗺️ Panama OSM Data Ingestion (Geofabrik)")
-     print("=" * 60)
-
-     if extract_and_convert():
-         register_in_catalog()
-         print("\n🚀 OSM data ready! Restart the backend to load new datasets.")
-     else:
-         print("\n❌ Ingestion failed")
-         sys.exit(1)
backend/scripts/download_global_datasets.py DELETED
@@ -1,133 +0,0 @@
- #!/usr/bin/env python3
- """
- Download global geo-referenced datasets for Panama
- - OurAirports: Global airport database
- - WRI Global Power Plant Database
- - Other infrastructure datasets
- """
-
- import requests
- import pandas as pd
- import geopandas as gpd
- from pathlib import Path
- import logging
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- DATA_DIR = Path(__file__).parent.parent / "data" / "global"
-
- # Dataset URLs
- DATASETS = {
-     "airports": {
-         "url": "https://davidmegginson.github.io/ourairports-data/airports.csv",
-         "description": "OurAirports - Global airport database"
-     },
-     "power_plants": {
-         "url": "https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3/global_power_plant_database.csv",
-         "description": "WRI Global Power Plant Database v1.3"
-     }
- }
-
- def download_airports():
-     """Download and process OurAirports data for Panama"""
-     logger.info("Downloading OurAirports global database...")
-
-     url = DATASETS["airports"]["url"]
-     response = requests.get(url)
-     response.raise_for_status()
-
-     # Save raw CSV
-     output_dir = DATA_DIR / "airports"
-     output_dir.mkdir(parents=True, exist_ok=True)
-
-     csv_path = output_dir / "airports_global.csv"
-     with open(csv_path, 'wb') as f:
-         f.write(response.content)
-
-     logger.info(f"Saved raw airports data: {csv_path}")
-
-     # Filter for Panama (iso_country = PA)
-     df = pd.read_csv(csv_path)
-     panama_df = df[df['iso_country'] == 'PA'].copy()
-
-     logger.info(f"Found {len(panama_df)} airports in Panama")
-
-     # Convert to GeoDataFrame
-     gdf = gpd.GeoDataFrame(
-         panama_df,
-         geometry=gpd.points_from_xy(panama_df.longitude_deg, panama_df.latitude_deg),
-         crs="EPSG:4326"
-     )
-
-     # Save as GeoJSON
-     geojson_path = output_dir / "panama_airports.geojson"
-     gdf.to_file(geojson_path, driver='GeoJSON')
-
-     logger.info(f"Created GeoJSON: {geojson_path}")
-     return geojson_path, len(gdf)
-
- def download_power_plants():
-     """Download and process WRI Global Power Plant Database for Panama"""
-     logger.info("Downloading WRI Global Power Plant Database...")
-
-     url = DATASETS["power_plants"]["url"]
-     response = requests.get(url)
-     response.raise_for_status()
-
-     # Save raw CSV
-     output_dir = DATA_DIR / "power_plants"
-     output_dir.mkdir(parents=True, exist_ok=True)
-
-     csv_path = output_dir / "power_plants_global.csv"
-     with open(csv_path, 'wb') as f:
-         f.write(response.content)
-
-     logger.info(f"Saved raw power plants data: {csv_path}")
-
-     # Filter for Panama (country = PAN)
-     df = pd.read_csv(csv_path)
-     panama_df = df[df['country'] == 'PAN'].copy()
-
-     logger.info(f"Found {len(panama_df)} power plants in Panama")
-
-     # Convert to GeoDataFrame
-     gdf = gpd.GeoDataFrame(
-         panama_df,
-         geometry=gpd.points_from_xy(panama_df.longitude, panama_df.latitude),
-         crs="EPSG:4326"
-     )
-
-     # Save as GeoJSON
-     geojson_path = output_dir / "panama_power_plants.geojson"
-     gdf.to_file(geojson_path, driver='GeoJSON')
-
-     logger.info(f"Created GeoJSON: {geojson_path}")
-     return geojson_path, len(gdf)
-
- def main():
-     logger.info("=== Global Dataset Download Starting ===")
-
-     results = []
-
-     try:
-         airports_path, airports_count = download_airports()
-         results.append({"dataset": "airports", "count": airports_count, "path": airports_path})
-     except Exception as e:
-         logger.error(f"Failed to download airports: {e}")
-
-     try:
-         power_path, power_count = download_power_plants()
-         results.append({"dataset": "power_plants", "count": power_count, "path": power_path})
-     except Exception as e:
-         logger.error(f"Failed to download power plants: {e}")
-
-     logger.info("\n=== Download Summary ===")
-     for result in results:
-         logger.info(f" {result['dataset']}: {result['count']} features")
-
-     logger.info("\n=== Complete ===")
-     return results
-
- if __name__ == "__main__":
-     main()
backend/scripts/download_hdx.py DELETED
@@ -1,72 +0,0 @@
- #!/usr/bin/env python3
- """
- HDX Data Downloader for Panama
- Downloads official datasets from Humanitarian Data Exchange
- """
-
- import requests
- from pathlib import Path
- import logging
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # HDX Dataset URLs (from research)
- HDX_DATASETS = {
-     "health": {
-         "name": "Panama - Health Indicators",
-         "url": "https://data.humdata.org/dataset/4d3f9ab7-8e5c-4a24-ae5d-cfc3e81b4db6",
-         "description": "WHO health indicators for Panama"
-     },
-     "education": {
-         "name": "Panama - Education",
-         "url": "https://data.humdata.org/dataset/panama-education-statistics",
-         "description": "UNESCO/World Bank education statistics"
-     },
-     "economy": {
-         "name": "Panama - Economy and Growth",
-         "url": "https://data.humdata.org/dataset/panama-economy-indicators",
-         "description": "World Bank economic indicators"
-     }
- }
-
- DATA_DIR = Path(__file__).parent.parent / "data" / "hdx"
-
- def download_hdx_dataset(dataset_key: str):
-     """Download a dataset from HDX"""
-     dataset = HDX_DATASETS[dataset_key]
-     logger.info(f"Downloading {dataset['name']}...")
-
-     # Create output directory
-     output_dir = DATA_DIR / dataset_key
-     output_dir.mkdir(parents=True, exist_ok=True)
-
-     try:
-         # HDX datasets typically have resource download URLs
-         # We'll need to parse the dataset page to get the actual download link
-         response = requests.get(dataset['url'])
-         response.raise_for_status()
-
-         # Note: This is a placeholder - actual implementation would need to:
-         # 1. Parse the HDX page HTML to find CSV/Excel download links
-         # 2. Download each resource file
-         # 3. Save to output_dir
-
-         logger.info(f"Downloaded to {output_dir}")
-         return output_dir
-
-     except Exception as e:
-         logger.error(f"Failed to download {dataset['name']}: {e}")
-         return None
-
- def main():
-     """Download all HDX datasets"""
-     logger.info("Starting HDX data download...")
-
-     for key in HDX_DATASETS.keys():
-         download_hdx_dataset(key)
-
-     logger.info("Download complete!")
-
- if __name__ == "__main__":
-     main()
backend/scripts/download_kontur.py DELETED
@@ -1,239 +0,0 @@
- """
- Panama Data Ingestion - Phase A: Kontur Population
-
- Downloads population density data from HDX (Humanitarian Data Exchange).
- Data source: https://data.humdata.org/dataset/kontur-population-panama
- """
-
- import os
- import sys
- import json
- import requests
- import gzip
- import shutil
- from pathlib import Path
-
- # HDX API for Kontur Population Panama
- HDX_DATASET_URL = "https://data.humdata.org/api/3/action/package_show?id=kontur-population-panama"
-
- # Output directories
- DATA_DIR = Path(__file__).parent.parent / "data"
- KONTUR_DIR = DATA_DIR / "kontur"
- TEMP_DIR = DATA_DIR / "temp"
-
-
- def get_download_url() -> str:
-     """Fetch the actual download URL from HDX API."""
-     print("🔍 Fetching download URL from HDX...")
-
-     try:
-         response = requests.get(HDX_DATASET_URL)
-         response.raise_for_status()
-         data = response.json()
-
-         if not data.get("success"):
-             print("❌ HDX API returned error")
-             return None
-
-         resources = data.get("result", {}).get("resources", [])
-
-         # Look for GeoJSON or GPKG file
-         for resource in resources:
-             name = resource.get("name", "").lower()
-             url = resource.get("url", "")
-
-             if "geojson" in name or "gpkg" in name:
-                 print(f" Found: {resource.get('name')}")
-                 return url
-
-         # Fallback to first resource
-         if resources:
-             return resources[0].get("url")
-
-         return None
-
-     except Exception as e:
-         print(f"❌ Failed to fetch HDX metadata: {e}")
-         return None
-
-
- def download_file(url: str, dest: Path) -> bool:
-     """Download a file with progress indication."""
-     print(f"📥 Downloading from {url[:80]}...")
-
-     try:
-         response = requests.get(url, stream=True)
-         response.raise_for_status()
-
-         total_size = int(response.headers.get('content-length', 0))
-         downloaded = 0
-
-         with open(dest, 'wb') as f:
-             for chunk in response.iter_content(chunk_size=8192):
-                 f.write(chunk)
-                 downloaded += len(chunk)
-                 if total_size > 0:
-                     pct = (downloaded / total_size) * 100
-                     print(f"\r Progress: {pct:.1f}% ({downloaded // 1024}KB)", end="")
-
-         print(f"\n✅ Downloaded to {dest}")
-         return True
-
-     except Exception as e:
-         print(f"❌ Download failed: {e}")
-         return False
-
-
- def decompress_if_needed(file_path: Path) -> Path:
-     """Decompress .gz file if needed."""
-     if file_path.suffix == '.gz':
-         output_path = file_path.with_suffix('')
-         print(f"📦 Decompressing {file_path.name}...")
-
-         with gzip.open(file_path, 'rb') as f_in:
-             with open(output_path, 'wb') as f_out:
-                 shutil.copyfileobj(f_in, f_out)
-
-         return output_path
-
-     return file_path
-
-
- def download_population_data():
-     """Download Kontur Population data for Panama."""
-
-     # Ensure directories exist
-     KONTUR_DIR.mkdir(parents=True, exist_ok=True)
-     TEMP_DIR.mkdir(parents=True, exist_ok=True)
-
-     # Get download URL
-     download_url = get_download_url()
-
-     if not download_url:
-         # Fallback to known URL pattern
-         download_url = "https://geodata-eu-central-1-kontur-public.s3.amazonaws.com/kontur_datasets/kontur_population_PA_20231101.gpkg.gz"
-         print(f"⚠️ Using fallback URL: {download_url}")
-
-     # Determine filename
-     filename = download_url.split("/")[-1]
-     temp_path = TEMP_DIR / filename
-
-     # Download
-     if not temp_path.exists():
-         if not download_file(download_url, temp_path):
-             return None
-     else:
-         print(f"📦 Using cached {temp_path}")
-
-     # Decompress if needed
-     data_path = decompress_if_needed(temp_path)
-
-     # Move to final location
-     final_path = KONTUR_DIR / data_path.name
-     if data_path != final_path:
-         shutil.move(str(data_path), str(final_path))
-
-     print(f"✅ Population data ready at {final_path}")
-     return final_path
-
-
- def convert_gpkg_to_geojson(gpkg_path: Path) -> Path:
-     """Convert GeoPackage to GeoJSON using ogr2ogr."""
-     import subprocess
-
-     geojson_path = gpkg_path.with_suffix('.geojson')
-
-     print(f"🔄 Converting to GeoJSON...")
-
-     try:
-         # First, list layers in the GPKG
-         result = subprocess.run(
-             ["ogrinfo", "-so", str(gpkg_path)],
-             capture_output=True, text=True
-         )
-
-         # Get the first layer name
-         layer_name = None
-         for line in result.stdout.split('\n'):
-             if ': ' in line and 'using driver' not in line.lower():
-                 parts = line.split(':')
-                 if len(parts) >= 2:
-                     layer_name = parts[0].strip().split()[-1]
-                     break
-
-         if not layer_name:
-             layer_name = "population"  # Default guess
-
-         cmd = [
-             "ogr2ogr",
-             "-f", "GeoJSON",
-             "-t_srs", "EPSG:4326",
-             str(geojson_path),
-             str(gpkg_path),
-             layer_name
-         ]
-
-         result = subprocess.run(cmd, capture_output=True, text=True)
-
-         if result.returncode == 0:
-             size_mb = geojson_path.stat().st_size / 1024 / 1024
-             print(f"✅ Created {geojson_path.name} ({size_mb:.1f}MB)")
-             return geojson_path
-         else:
-             print(f"❌ Conversion failed: {result.stderr}")
-             return None
-
-     except FileNotFoundError:
-         print("⚠️ ogr2ogr not found. Keeping GPKG format.")
-         return gpkg_path
-
-
- def register_in_catalog(data_path: Path):
-     """Register population dataset in the catalog."""
-
-     catalog_path = DATA_DIR / "catalog.json"
-
-     if catalog_path.exists():
-         with open(catalog_path) as f:
-             catalog = json.load(f)
-     else:
-         catalog = {}
-
-     # Determine relative path
-     rel_path = str(data_path.relative_to(DATA_DIR))
-
-     catalog["kontur_population"] = {
-         "source_file": rel_path,
-         "source_type": data_path.suffix[1:],  # geojson or gpkg
-         "description": "Population density grid for Panama at 400m H3 hexagon resolution. Based on GHSL, Facebook HRSL, and Microsoft Buildings data.",
-         "tags": ["population", "density", "panama", "h3", "hexagon", "kontur", "demographics"],
-         "data_type": "vector",
-         "geometry_type": "polygon",
-         "semantic_description": "Population count per 400m H3 hexagonal grid cell. Use for population density analysis, demographic studies, and urban/rural classification."
-     }
-
-     with open(catalog_path, 'w') as f:
-         json.dump(catalog, f, indent=2)
-
-     print(f"📝 Registered kontur_population in catalog")
-
-
- if __name__ == "__main__":
-     print("=" * 60)
-     print("👥 Panama Population Data Ingestion (Kontur/HDX)")
-     print("=" * 60)
-
-     data_path = download_population_data()
-
-     if data_path:
-         # Convert to GeoJSON if GPKG
-         if data_path.suffix == '.gpkg':
-             geojson_path = convert_gpkg_to_geojson(data_path)
-             if geojson_path and geojson_path.suffix == '.geojson':
-                 data_path = geojson_path
-
-         register_in_catalog(data_path)
-         print("\n🚀 Population data ready! Restart the backend to load.")
-     else:
-         print("\n❌ Ingestion failed")
-         sys.exit(1)
backend/scripts/download_overture.py DELETED
@@ -1,133 +0,0 @@
- """
- Panama Data Ingestion - Phase B: Overture Maps (Official SDK)
-
- Uses the 'overturemaps' Python CLI/SDK to download data for Panama.
- Themes: places, transportation, buildings.
- """
-
- import subprocess
- import os
- import sys
- import json
- from pathlib import Path
-
- # Panama Bounding Box
- BBOX = "-83.05,7.20,-77.17,9.65"  # xmin, ymin, xmax, ymax
-
- DATA_DIR = Path(__file__).parent.parent / "data"
- OVERTURE_DIR = DATA_DIR / "overture"
-
- def run_overture_download(theme_type: str, output_name: str):
-     """
-     Download a specific Overture theme type using the CLI.
-     command: overturemaps download --bbox <bbox> -f geojson --type <type> -o <outfile>
-     """
-     print(f"\n🌍 Downloading Overture {theme_type}...")
-
-     # Ensure output dir
-     OVERTURE_DIR.mkdir(parents=True, exist_ok=True)
-
-     output_file = OVERTURE_DIR / output_name
-
-     # Try using the CLI via subprocess
-     # Note: overturemaps downloads to a file buffer then writes.
-     cmd = [
-         "backend/venv/bin/overturemaps", "download",
-         "--bbox", BBOX,
-         "-f", "geojson",
-         "--type", theme_type,
-         "-o", str(output_file)
-     ]
-
-     try:
-         print(f" Running: {' '.join(cmd)}")
-         subprocess.run(cmd, check=True)
-
-         if output_file.exists():
-             size_mb = output_file.stat().st_size / 1024 / 1024
-             print(f" ✅ Downloaded {output_name} ({size_mb:.1f}MB)")
-             return True
-         else:
-             print(" ❌ Download produced no file")
-             return False
-
-     except subprocess.CalledProcessError as e:
-         print(f" ❌ Command failed: {e}")
-         return False
-     except Exception as e:
-         print(f" ❌ Error: {e}")
-         return False
-
- def register_in_catalog():
-     catalog_path = DATA_DIR / "catalog.json"
-     if catalog_path.exists():
-         with open(catalog_path) as f:
-             catalog = json.load(f)
-     else:
-         catalog = {}
-
-     # Places
-     if (OVERTURE_DIR / "overture_places.geojson").exists():
-         catalog["overture_places"] = {
-             "source_file": "overture/overture_places.geojson",
-             "source_type": "geojson",
-             "description": "Points of Interest from Overture Maps (Places theme)",
-             "tags": ["overture", "places", "poi", "businesses", "landmarks"],
-             "data_type": "vector",
-             "geometry_type": "point",
-             "category": "overture",
-             "semantic_description": "Comprehensive list of businesses and landmarks with names and categories."
-         }
-
-     # Roads
-     if (OVERTURE_DIR / "overture_roads.geojson").exists():
-         catalog["overture_roads"] = {
-             "source_file": "overture/overture_roads.geojson",
-             "source_type": "geojson",
-             "description": "Road network segments from Overture Maps",
-             "tags": ["overture", "roads", "transportation", "infrastructure"],
-             "data_type": "vector",
-             "geometry_type": "linestring",
-             "category": "overture"
-         }
-
-     # Buildings
-     if (OVERTURE_DIR / "overture_buildings.geojson").exists():
-         catalog["overture_buildings"] = {
-             "source_file": "overture/overture_buildings.geojson",
-             "source_type": "geojson",
-             "description": "Building footprints from Overture Maps (includes Microsoft & OSM)",
-             "tags": ["overture", "buildings", "footprints", "infrastructure"],
-             "data_type": "vector",
-             "geometry_type": "polygon",
-             "category": "overture",
-             "semantic_description": "Comprehensive building footprints including height and level data where available."
-         }
-
-     with open(catalog_path, 'w') as f:
-         json.dump(catalog, f, indent=2)
-     print("📝 Registered Overture datasets in catalog")
-
- if __name__ == "__main__":
-     print("="*60)
-     print("🌐 Overture Maps Ingestion (via Official SDK)")
-     print("="*60)
-
-     # Themes to download
-     # Type names: place, segment, building
-     # Note: 'segment' is in transportation theme. 'building' in buildings.
-
-     results = []
-     results.append(run_overture_download("place", "overture_places.geojson"))
-     results.append(run_overture_download("segment", "overture_roads.geojson"))
-
-     # Buildings might be HUGE.
-     # Panama isn't that big but buildings has many polygons.
-     # Let's try it.
-     results.append(run_overture_download("building", "overture_buildings.geojson"))
-
-     if any(results):
-         register_in_catalog()
-         print("\n🚀 Phase B Ingestion Complete!")
-     else:
-         print("\n❌ All downloads failed.")
backend/scripts/download_stri_data.py DELETED
@@ -1,79 +0,0 @@
- #!/usr/bin/env python3
- """
- Download Panama Protected Areas from STRI GIS Portal
- Download Protected Areas shapefile and convert to GeoJSON
- """
-
- import requests
- import geopandas as gpd
- from pathlib import Path
- import logging
- import zipfile
- import io
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- DATA_DIR = Path(__file__).parent.parent / "data" / "stri"
-
- # STRI GIS Data Portal URLs
- STRI_DATASETS = {
-     "protected_areas": {
-         "url": "https://smithsoniangis.maps.arcgis.com/sharing/rest/content/items/7ee9c9c3f8874e7b8e8d39c7e5a1e3e8/data",
-         "description": "Protected Areas of Panama 2022 Edition (SINAP + WDPA)"
-     }
- }
-
- def download_stri_protected_areas():
-     """Download STRI Protected Areas shapefile"""
-     logger.info("Attempting to download STRI Protected Areas...")
-
-     output_dir = DATA_DIR / "protected_areas"
-     output_dir.mkdir(parents=True, exist_ok=True)
-
-     # Try alternative: use ArcGIS REST API to export to GeoJSON
-     # This is the standard ESRI Feature Service export endpoint
-     service_url = "https://services.arcgis.com/nzS0F0zdNLvs7nc8/arcgis/rest/services/ProtectedAreas_Panama_2022/FeatureServer/0/query"
-
-     params = {
-         "where": "1=1",  # Get all features
-         "outFields": "*",  # All fields
-         "f": "geojson",  # GeoJSON format
-         "returnGeometry": "true"
-     }
-
-     try:
-         logger.info("Querying STRI ArcGIS Feature Service...")
-         response = requests.get(service_url, params=params, timeout=120)
-         response.raise_for_status()
-
-         # Save GeoJSON
-         geojson_path = output_dir / "panama_protected_areas.geojson"
-         with open(geojson_path, 'wb') as f:
-             f.write(response.content)
-
-         # Read to get count
-         gdf = gpd.read_file(geojson_path)
-         logger.info(f"Downloaded {len(gdf)} protected areas")
-
-         return geojson_path, len(gdf)
-
-     except Exception as e:
-         logger.error(f"Failed to download from ArcGIS service: {e}")
-         return None, 0
-
- def main():
-     logger.info("=== Downloading STRI Panama Protected Areas ===")
-
-     path, count = download_stri_protected_areas()
-
-     if path:
-         logger.info(f"\n✅ Success: {count} protected areas downloaded")
-         logger.info(f" Path: {path}")
-     else:
-         logger.error("\n❌ Failed to download protected areas")
-
-     return path, count
-
- if __name__ == "__main__":
-     main()
backend/scripts/download_worldbank.py DELETED
@@ -1,141 +0,0 @@
- #!/usr/bin/env python3
- """
- World Bank Data Downloader for Panama
- Downloads socio-economic indicators from World Bank API v2
- API Documentation: https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation
- """
-
- import requests
- import pandas as pd
- from pathlib import Path
- import logging
- import time
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # World Bank API base URL
- WB_API_BASE = "https://api.worldbank.org/v2"
-
- # Key indicators for Panama (ISO3: PAN)
- INDICATORS = {
-     # Poverty & Inequality
-     "SI.POV.NAHC": "Poverty headcount ratio at national poverty lines (% of population)",
-     "SI.POV.DDAY": "Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)",
-     "SI.POV.UMIC": "Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population)",
-     "SI.POV.GINI": "Gini index (World Bank estimate)",
-
-     # Employment & Labor
-     "SL.UEM.TOTL.ZS": "Unemployment, total (% of total labor force)",
-     "SL.TLF.CACT.FE.ZS": "Labor force participation rate, female (% of female population ages 15+)",
-     "SL.TLF.CACT.MA.ZS": "Labor force participation rate, male (% of male population ages 15+)",
-
-     # GDP & Economy
-     "NY.GDP.MKTP.CD": "GDP (current US$)",
-     "NY.GDP.PCAP.CD": "GDP per capita (current US$)",
-     "NY.GDP.MKTP.KD.ZG": "GDP growth (annual %)",
-
-     # Health
-     "SH.STA.MMRT": "Maternal mortality ratio (per 100,000 live births)",
-     "SH.DYN.MORT": "Mortality rate, under-5 (per 1,000 live births)",
-     "SH.XPD.CHEX.GD.ZS": "Current health expenditure (% of GDP)",
-
-     # Education
-     "SE.ADT.LITR.ZS": "Literacy rate, adult total (% of people ages 15 and above)",
-     "SE.PRM.NENR": "School enrollment, primary (% net)",
-     "SE.SEC.NENR": "School enrollment, secondary (% net)",
-     "SE.XPD.TOTL.GD.ZS": "Government expenditure on education, total (% of GDP)"
- }
-
- DATA_DIR = Path(__file__).parent.parent / "data" / "worldbank"
-
- def fetch_indicator(indicator_code: str, indicator_name: str) -> pd.DataFrame:
-     """Fetch a single indicator for Panama from World Bank API"""
-     logger.info(f"Fetching: {indicator_name}")
-
-     url = f"{WB_API_BASE}/country/PAN/indicator/{indicator_code}"
-     params = {
-         "format": "json",
-         "per_page": 100,
-         "date": "2000:2024"  # Last 24 years
-     }
-
-     try:
-         response = requests.get(url, params=params)
-         response.raise_for_status()
-         data = response.json()
-
-         if len(data) < 2 or not data[1]:
-             logger.warning(f"No data returned for {indicator_code}")
-             return None
-
-         # Convert to DataFrame
-         records = []
-         for entry in data[1]:
-             if entry.get('value') is not None:
-                 records.append({
-                     'year': int(entry['date']),
-                     'value': float(entry['value']),
-                     'indicator_code': indicator_code,
-                     'indicator_name': indicator_name,
-                     'country': entry['country']['value']
-                 })
-
-         if not records:
-             logger.warning(f"No valid values for {indicator_code}")
-             return None
-
-         df = pd.DataFrame(records)
-         logger.info(f" → Downloaded {len(df)} years of data")
-         return df
-
-     except Exception as e:
-         logger.error(f"Failed to fetch {indicator_code}: {e}")
-         return None
-
- def download_all_indicators():
-     """Download all indicators and save to CSV"""
-     DATA_DIR.mkdir(parents=True, exist_ok=True)
-
-     all_data = []
-
-     for code, name in INDICATORS.items():
-         df = fetch_indicator(code, name)
-         if df is not None:
-             all_data.append(df)
-         time.sleep(0.5)  # Rate limiting
-
-     if not all_data:
-         logger.error("No data downloaded!")
-         return
-
-     # Combine all indicators
-     combined_df = pd.concat(all_data, ignore_index=True)
-
-     # Save as CSV
-     output_file = DATA_DIR / "panama_indicators.csv"
-     combined_df.to_csv(output_file, index=False)
-     logger.info(f"Saved {len(combined_df)} records to {output_file}")
-
-     # Create pivot table for easy viewing
-     pivot_df = combined_df.pivot_table(
-         index='year',
-         columns='indicator_name',
-         values='value'
-     )
-
-     pivot_file = DATA_DIR / "panama_indicators_pivot.csv"
-     pivot_df.to_csv(pivot_file)
-     logger.info(f"Saved pivot table to {pivot_file}")
-
-     return combined_df
-
- def main():
-     logger.info("Starting World Bank data download for Panama...")
-     download_all_indicators()
-     logger.info("Download complete!")
-
- if __name__ == "__main__":
-     main()