{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "GeoQuery Data Catalog Entry",
  "description": "Schema for dataset metadata in the GeoQuery platform catalog",
  "type": "object",
  "required": [
    "path",
    "columns",
    "category",
    "format"
  ],
  "properties": {
    "path": {
      "type": "string",
      "description": "Relative path to the data file from the data directory"
    },
    "description": {
      "type": "string",
      "description": "Auto-generated basic description (e.g., 'Data from hdx/health.geojson')"
    },
    "semantic_description": {
      "type": [
        "string",
        "null"
      ],
      "description": "LLM-generated rich description explaining the dataset's contents and use cases"
    },
    "tags": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "Searchable tags for categorization (e.g., ['health', 'facilities', 'infrastructure'])"
    },
    "data_type": {
      "type": "string",
      "enum": [
        "static",
        "semi-static",
        "realtime"
      ],
      "description": "How frequently the data changes",
      "default": "static"
    },
    "update_frequency": {
      "type": [
        "string",
        "null"
      ],
      "enum": [
        null,
        "yearly",
        "monthly",
        "weekly",
        "daily",
        "hourly",
        "realtime"
      ],
      "description": "Expected update frequency for the dataset"
    },
    "columns": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "List of column names in the dataset"
    },
    "row_count": {
      "type": [
        "integer",
        "null"
      ],
      "minimum": 0,
      "description": "Number of features/rows in the dataset"
    },
    "category": {
      "type": "string",
      "description": "Source category (base, osm, hdx, inec, custom)"
    },
    "format": {
      "type": "string",
      "enum": [
        "geojson",
        "shapefile",
        "geoparquet",
        "csv"
      ],
      "description": "File format of the source data"
    },
    "geometry_type": {
      "type": [
        "string",
        "null"
      ],
      "enum": [
        null,
        "Point",
        "MultiPoint",
        "LineString",
        "MultiLineString",
        "Polygon",
        "MultiPolygon"
      ],
      "description": "Type of geometries in the dataset"
    },
    "bbox": {
      "type": [
        "array",
        "null"
      ],
      "items": {
        "type": "number"
      },
      "minItems": 4,
      "maxItems": 4,
      "description": "Bounding box [minLon, minLat, maxLon, maxLat]"
    },
    "source": {
      "type": [
        "string",
        "null"
      ],
      "description": "Original source of the data (e.g., 'OpenStreetMap', 'INEC Census 2023')"
    },
    "license": {
      "type": [
        "string",
        "null"
      ],
      "description": "Data license (e.g., 'ODbL', 'CC-BY-4.0', 'Public Domain')"
    },
    "last_indexed": {
      "type": "string",
      "format": "date-time",
      "description": "ISO timestamp when the dataset was last indexed"
    },
    "last_enriched": {
      "type": [
        "string",
        "null"
      ],
      "format": "date-time",
      "description": "ISO timestamp when LLM enrichment was last run"
    }
  }
}