GeoQuery / backend /services /data_loader.py
GerardCB's picture
Deploy to Spaces (Final Clean)
4851501
"""
Data Loader Service for Panama Geographic Data
Loads GeoJSON files from the data/raw directory and provides
query capabilities for the LLM to search and filter features.
"""
import os
import json
from typing import List, Dict, Any, Optional
from functools import lru_cache
class PanamaDataLoader:
    """
    Singleton service to load and query Panama geographic data.

    The four administrative-level GeoJSON files are read once, the first
    time the class is constructed, and their features are kept in memory.
    Every later construction returns the same instance without reloading.
    """

    _instance = None
    _data_loaded = False

    # Attribute names that hold feature lists; the only values accepted as
    # ``admin_level`` by search_by_name().
    _ADMIN_LEVELS = ("admin0", "admin1", "admin2", "admin3")

    # Property keys inspected by search_by_name(), in the order they are tried.
    _NAME_KEYS = ("adm1_name", "adm2_name", "adm3_name", "adm0_name")

    # P-code length -> admin level number (2 chars = country ... 8 = corregimiento).
    _PCODE_LEVEL_BY_LENGTH = {2: 0, 4: 1, 6: 2, 8: 3}

    # Data storage: one list of GeoJSON feature dicts per level. These
    # class-level lists are placeholders; _load_data() rebinds them on the
    # singleton instance.
    admin0: List[Dict[str, Any]] = []  # Country
    admin1: List[Dict[str, Any]] = []  # Provinces (13)
    admin2: List[Dict[str, Any]] = []  # Districts (76)
    admin3: List[Dict[str, Any]] = []  # Corregimientos (594)

    def __new__(cls):
        """Create the single shared instance on first use, then reuse it."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Load the data files unless a previous construction already did."""
        # __init__ runs on every PanamaDataLoader() call, even though __new__
        # returns the same object, so the class-level flag prevents reloads.
        if not PanamaDataLoader._data_loaded:
            self._load_data()
            PanamaDataLoader._data_loaded = True

    @staticmethod
    def _properties(feature: Dict[str, Any]) -> Dict[str, Any]:
        """Return a feature's ``properties`` dict, or ``{}`` if missing or null."""
        # GeoJSON permits "properties": null, in which case
        # feature.get("properties", {}) would hand back None.
        return feature.get("properties") or {}

    def _get_data_path(self) -> str:
        """Get the path to the data/raw directory."""
        # The project root is taken to be two directories above the
        # directory containing this file (backend/services -> project root).
        current_dir = os.path.dirname(os.path.abspath(__file__))
        project_root = os.path.dirname(os.path.dirname(current_dir))
        return os.path.join(project_root, "data", "raw")

    def _load_geojson(self, filename: str) -> List[Dict[str, Any]]:
        """
        Load a GeoJSON file and return its features.

        Args:
            filename: File name inside the data/raw directory.

        Returns:
            The value of the file's top-level ``features`` member, or an
            empty list when the file is missing or cannot be read/parsed.
            Problems are reported on stdout rather than raised.
        """
        filepath = os.path.join(self._get_data_path(), filename)
        if not os.path.exists(filepath):
            print(f"Warning: {filepath} not found")
            return []
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            features = data.get('features', [])
            print(f" Loaded {len(features)} features from {filename}")
            return features
        except Exception as e:
            # Deliberate best-effort: a bad file must not prevent the other
            # levels from loading, so report and continue with no features.
            print(f"Error loading {filename}: {e}")
            return []

    def _load_data(self):
        """Load all GeoJSON data files and print a summary to stdout."""
        print("=" * 50)
        print("Loading Panama Geographic Data...")
        print("=" * 50)
        self.admin0 = self._load_geojson("pan_admin0.geojson")
        self.admin1 = self._load_geojson("pan_admin1.geojson")
        self.admin2 = self._load_geojson("pan_admin2.geojson")
        self.admin3 = self._load_geojson("pan_admin3.geojson")
        total = len(self.admin0) + len(self.admin1) + len(self.admin2) + len(self.admin3)
        print(f"Total features loaded: {total}")
        print("=" * 50)

    def get_schema_context(self) -> str:
        """Return schema description for LLM context."""
        return """
Panama Geographic Data (HDX Administrative Boundaries):
1. admin0 (Country Level)
- adm0_name: "Panamá"
- adm0_pcode: "PA"
- area_sqkm: country area in square kilometers
- geometry: MultiPolygon
2. admin1 (Provinces - 13 total)
- adm1_name: Province name (e.g., "Bocas del Toro", "Panamá", "Colón")
- adm1_pcode: Province code (e.g., "PA01", "PA08")
- adm0_name: "Panamá"
- area_sqkm: province area
- center_lat, center_lon: centroid coordinates
- geometry: MultiPolygon
3. admin2 (Districts - 76 total)
- adm2_name: District name
- adm2_pcode: District code (e.g., "PA0101")
- adm1_name: Parent province name
- adm1_pcode: Parent province code
- area_sqkm: district area
- center_lat, center_lon: centroid coordinates
- geometry: MultiPolygon
4. admin3 (Corregimientos - 594 total)
- adm3_name: Corregimiento name
- adm3_pcode: Corregimiento code (e.g., "PA010101")
- adm2_name: Parent district name
- adm2_pcode: Parent district code
- adm1_name: Parent province name
- area_sqkm: corregimiento area
- center_lat, center_lon: centroid coordinates
- geometry: MultiPolygon
Notes:
- All geometries use WGS84 (EPSG:4326) coordinate system
- P-codes follow ISO 3166-2 format
- Valid as of 2021-10-20
"""

    def get_data_citations(self, admin_levels: List[str]) -> List[str]:
        """
        Return citations for the queried data.

        Args:
            admin_levels: Level identifiers ("admin0" .. "admin3").
                Unrecognised entries are ignored.

        Returns:
            One citation per recognised level, in input order, or a single
            generic citation when none of the entries is recognised.
        """
        level_names = {
            "admin0": "Panama Country Boundary",
            "admin1": "Panama Provinces",
            "admin2": "Panama Districts",
            "admin3": "Panama Corregimientos",
        }
        citations = [
            f"{level_names[level]} (HDX COD-AB, 2021)"
            for level in admin_levels
            if level in level_names
        ]
        return citations or ["Panama Administrative Boundaries (HDX COD-AB, 2021)"]

    def search_by_name(
        self,
        name: str,
        admin_level: Optional[str] = None,
        limit: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Search for features by name (case-insensitive partial match).

        A feature matches when any of its adm1/adm2/adm3/adm0 name
        properties contains the search term, so features that carry a
        parent's name also match a search for that parent.

        Args:
            name: Search term
            admin_level: Optional filter ("admin0", "admin1", "admin2",
                "admin3"). Any other non-empty value yields no results.
                When omitted, admin1, admin2 and admin3 are searched in
                that order.
            limit: Maximum results to return; zero or less returns nothing.

        Returns:
            A list of ``{"level": <level name>, "feature": <feature dict>}``
            entries, at most ``limit`` long.
        """
        name_lower = name.lower()
        if limit <= 0:
            return []

        if admin_level:
            # Only look up known data attributes; an arbitrary name could
            # otherwise resolve to a method or other non-list attribute.
            if admin_level not in self._ADMIN_LEVELS:
                return []
            levels_to_search = [(admin_level, getattr(self, admin_level))]
        else:
            levels_to_search = [
                ("admin1", self.admin1),
                ("admin2", self.admin2),
                ("admin3", self.admin3),
            ]

        results: List[Dict[str, Any]] = []
        for level_name, features in levels_to_search:
            for feature in features:
                props = self._properties(feature)
                matched = any(
                    isinstance(value, str) and value and name_lower in value.lower()
                    for value in (props.get(key) for key in self._NAME_KEYS)
                )
                if not matched:
                    continue
                results.append({"level": level_name, "feature": feature})
                if len(results) >= limit:
                    return results
        return results

    def get_all_provinces(self) -> List[Dict[str, Any]]:
        """Get all provinces (admin1). Returns the stored list itself, not a copy."""
        return self.admin1

    def get_all_districts(self, province_pcode: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get all districts, optionally filtered by province.

        Args:
            province_pcode: When given, keep only districts whose
                ``adm1_pcode`` property equals this value exactly.

        Returns:
            The filtered list, or the stored admin2 list itself when no
            filter is given.
        """
        if not province_pcode:
            return self.admin2
        return [
            f for f in self.admin2
            if self._properties(f).get("adm1_pcode") == province_pcode
        ]

    def get_all_corregimientos(
        self,
        district_pcode: Optional[str] = None,
        province_pcode: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Get all corregimientos, optionally filtered.

        Args:
            district_pcode: Keep only features whose ``adm2_pcode`` equals
                this value. Takes precedence over ``province_pcode``.
            province_pcode: Keep only features whose ``adm1_pcode`` equals
                this value; used only when ``district_pcode`` is not given.

        Returns:
            The filtered list, or the stored admin3 list itself when no
            filter is given.
        """
        if district_pcode:
            key, wanted = "adm2_pcode", district_pcode
        elif province_pcode:
            key, wanted = "adm1_pcode", province_pcode
        else:
            return self.admin3
        return [f for f in self.admin3 if self._properties(f).get(key) == wanted]

    def get_by_pcode(self, pcode: str) -> Optional[Dict[str, Any]]:
        """
        Get a feature by its P-code.

        The admin level is inferred from the code's length (2, 4, 6 or 8
        characters); the comparison is done on the upper-cased code with
        surrounding whitespace removed.

        Args:
            pcode: P-code such as "PA", "PA01", "PA0101" or "PA010101".

        Returns:
            The first matching feature, or None when the length is not
            recognised or no feature carries that code.
        """
        pcode_upper = pcode.strip().upper()
        level = self._PCODE_LEVEL_BY_LENGTH.get(len(pcode_upper))
        if level is None:
            return None
        pcode_key = f"adm{level}_pcode"
        for f in getattr(self, f"admin{level}"):
            if self._properties(f).get(pcode_key) == pcode_upper:
                return f
        return None

    def to_geojson(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Convert a list of features to a GeoJSON FeatureCollection.

        Args:
            features: Raw feature dicts and/or wrapped results from
                search_by_name() (dicts with a "feature" key), in any mix.

        Returns:
            A ``{"type": "FeatureCollection", "features": [...]}`` dict with
            wrapped entries replaced by the feature they contain.
        """
        return {
            "type": "FeatureCollection",
            "features": [f["feature"] if "feature" in f else f for f in features],
        }
# Module-level handle to the shared loader; stays None until first requested.
_data_loader: Optional[PanamaDataLoader] = None


def get_data_loader() -> PanamaDataLoader:
    """Return the shared PanamaDataLoader, constructing it on the first call."""
    global _data_loader
    if _data_loader is not None:
        return _data_loader
    _data_loader = PanamaDataLoader()
    return _data_loader