"""
STRI GIS Portal Catalog Scraper

Discovers and catalogs datasets from the Smithsonian Tropical Research Institute
GIS Portal using the ArcGIS Online API.
"""

import json
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

DATA_DIR = Path(__file__).parent.parent / "data" / "stri"
METADATA_DIR = DATA_DIR / "metadata"

# ArcGIS Online organization ID of the STRI portal
STRI_ORG_ID = "nzS0F0zdNLvs7nc8"
ARCGIS_BASE_URL = "https://www.arcgis.com/sharing/rest"

# Keywords suggesting national-scale coverage; each match adds to the score.
HIGH_PRIORITY_KEYWORDS = [
    "panama", "national", "country", "forest", "cover", "protected", "areas",
    "land use", "biodiversity", "climate", "water", "infrastructure",
    "administrative", "boundaries", "poverty", "population"
]

# Keywords suggesting site-specific coverage (e.g. Barro Colorado Island);
# each match subtracts from the score.
LOW_PRIORITY_KEYWORDS = [
    "bci", "barro colorado", "island", "pena blanca", "site-specific",
    "trail", "sensor", "camera", "plot"
]

# Patterns for pulling a four-digit year (1900-2099) out of dataset titles
TEMPORAL_PATTERNS = [
    r"\b(19\d{2}|20\d{2})\b",
    r"edition\s+(19\d{2}|20\d{2})",
    r"version\s+(19\d{2}|20\d{2})"
]


def search_stri_portal(query: str = "panama OR panamá", num: int = 100, start: int = 1) -> Dict:
    """
    Search the STRI GIS Portal using the ArcGIS REST API.

    Args:
        query: Search query string (default matches Panama-specific datasets)
        num: Number of results per page (max 100)
        start: Starting position (1-based)

    Returns:
        JSON response with search results, or an empty dict on failure
    """
    search_url = f"{ARCGIS_BASE_URL}/search"

    params = {
        "q": f"orgid:{STRI_ORG_ID} AND ({query})",
        "f": "json",
        "num": num,
        "start": start,
        "sortField": "modified",
        "sortOrder": "desc"
    }

    try:
        response = requests.get(search_url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logger.error(f"Failed to search portal: {e}")
        return {}
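
# For reference, the call above amounts to a GET like the following
# (illustrative; requests URL-encodes the query string):
#
#   https://www.arcgis.com/sharing/rest/search
#       ?q=orgid:nzS0F0zdNLvs7nc8 AND (panama OR panamá)
#       &f=json&num=100&start=1&sortField=modified&sortOrder=desc
#
# and the JSON response carries paging fields plus the item list, roughly:
#
#   {"total": 250, "start": 1, "num": 100, "nextStart": 101, "results": [...]}
#
# The numbers here are made up; only "total" and "results" are consumed below.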


def get_item_details(item_id: str) -> Optional[Dict]:
    """Get detailed metadata for a specific item"""
    details_url = f"{ARCGIS_BASE_URL}/content/items/{item_id}"

    params = {"f": "json"}

    try:
        response = requests.get(details_url, params=params, timeout=30)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logger.error(f"Failed to get item {item_id}: {e}")
        return None


def extract_year_from_title(title: str) -> Optional[int]:
    """Extract a four-digit year from a dataset title, if present."""
    for pattern in TEMPORAL_PATTERNS:
        match = re.search(pattern, title, re.IGNORECASE)
        if match:
            # Every pattern captures the year in group 1; group(0) is a
            # defensive fallback for patterns without a capture group.
            year_str = match.group(1) if match.lastindex else match.group(0)
            try:
                return int(year_str)
            except ValueError:
                continue
    return None
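
# Illustrative behavior (titles hypothetical):
#   extract_year_from_title("Forest Cover 2019")              -> 2019
#   extract_year_from_title("Protected Areas, edition 2015")  -> 2015
#   extract_year_from_title("Hydrography")                    -> None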


def calculate_priority_score(item: Dict) -> float:
    """
    Calculate a priority score for a dataset based on:
    - National vs. site-specific coverage
    - Relevance keywords
    - Data type (Feature Services preferred)
    - Recency
    """
    score = 50.0

    # Any of these fields may be missing or None in portal metadata.
    title = (item.get("title") or "").lower()
    description = (item.get("description") or "").lower()
    tags = " ".join(item.get("tags") or []).lower()
    item_type = item.get("type", "")

    combined_text = f"{title} {description} {tags}"

    # Reward national-scale keywords, penalize site-specific ones
    for keyword in HIGH_PRIORITY_KEYWORDS:
        if keyword in combined_text:
            score += 5

    for keyword in LOW_PRIORITY_KEYWORDS:
        if keyword in combined_text:
            score -= 15

    # Prefer directly queryable services: Feature over Map Services
    if "Feature Service" in item_type:
        score += 20
    elif "Map Service" in item_type:
        score += 10

    # A year in the title hints at membership in a temporal series
    if extract_year_from_title(title):
        score += 10

    # Recency bonus: up to +10 for items modified after 2020-01-01
    # (1577836800000 is that date as a Unix timestamp in milliseconds;
    # items last modified before 2020 are penalized proportionally).
    modified = item.get("modified", 0)
    if modified:
        years_since_2020 = (modified - 1577836800000) / (365.25 * 24 * 60 * 60 * 1000)
        score += min(years_since_2020 * 2, 10)

    return score
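
# Worked example (hypothetical item): a Feature Service titled
# "Panama Forest Cover 2019", tagged "national", last modified mid-2023:
#   50 base + 4 keyword hits ("panama", "forest", "cover", "national") * 5
#   + 20 Feature Service + 10 year-in-title + ~7 recency  ->  ~107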


def build_rest_endpoint(item: Dict) -> Optional[str]:
    """Construct the REST endpoint URL for a Feature Service"""
    item_type = item.get("type", "")

    if "Feature Service" not in item_type:
        return None

    url = item.get("url")
    if url and "/FeatureServer" in url:
        # A URL that continues past ".../FeatureServer" already points at a
        # specific layer; otherwise default to layer 0.
        if not url.endswith(("FeatureServer", "FeatureServer/")):
            return url
        return f"{url.rstrip('/')}/0"

    # Fallback: guess the hosted-service path from the item ID. Best effort
    # only; the actual service name can differ from the item ID.
    item_id = item.get("id")
    if item_id:
        return f"https://services.arcgis.com/{STRI_ORG_ID}/arcgis/rest/services/{item_id}/FeatureServer/0"

    return None
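
# Illustrative outcomes (URLs hypothetical):
#   .../rest/services/ForestCover/FeatureServer    -> .../ForestCover/FeatureServer/0
#   .../rest/services/ForestCover/FeatureServer/2  -> returned unchanged
#   items that are not Feature Services            -> None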


def catalog_datasets(max_datasets: int = 100) -> List[Dict]:
    """
    Scrape the STRI portal and build a prioritized catalog.

    Args:
        max_datasets: Maximum number of datasets to retrieve

    Returns:
        List of dataset metadata dictionaries, sorted by priority score
    """
    datasets = []
    start = 1
    batch_size = 100

    logger.info("Scraping STRI GIS Portal...")

    while len(datasets) < max_datasets:
        logger.info(f"Fetching items {start} to {start + batch_size - 1}...")

        results = search_stri_portal(num=batch_size, start=start)

        if not results or "results" not in results:
            break

        items = results["results"]

        if not items:
            break

        for item in items:
            # Only Feature Services expose queryable REST layers
            if "Feature Service" not in item.get("type", ""):
                continue

            priority = calculate_priority_score(item)
            year = extract_year_from_title(item.get("title", ""))
            rest_endpoint = build_rest_endpoint(item)

            dataset = {
                "id": item.get("id"),
                "title": item.get("title"),
                "description": item.get("description", ""),
                "type": item.get("type"),
                "tags": item.get("tags", []),
                "modified": item.get("modified"),
                # "modified" is a Unix timestamp in milliseconds
                "modified_date": datetime.fromtimestamp(
                    item.get("modified", 0) / 1000
                ).isoformat() if item.get("modified") else None,
                "url": item.get("url"),
                "rest_endpoint": rest_endpoint,
                "year": year,
                "priority_score": round(priority, 2)
            }

            datasets.append(dataset)

        # Stop when the next page would run past the reported total
        if start + batch_size > results.get("total", 0):
            break

        start += batch_size

    # Highest-priority datasets first
    datasets.sort(key=lambda x: x["priority_score"], reverse=True)

    logger.info(f"Found {len(datasets)} Feature Service datasets")

    return datasets[:max_datasets]
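
# Usage sketch: catalog_datasets(max_datasets=50) returns at most 50 Feature
# Service items, already sorted by priority_score in descending order.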


def identify_temporal_groups(datasets: List[Dict]) -> Dict[str, List[Dict]]:
    """
    Group datasets by base name to identify temporal series.

    Returns:
        Dictionary mapping base name to a year-sorted list of datasets
    """
    temporal_groups = {}

    for dataset in datasets:
        if dataset["year"] is None:
            continue

        # Derive a base name by stripping the year, "edition"/"version"
        # markers, and redundant whitespace from the title.
        title = dataset["title"]
        base_name = re.sub(r'\b(19\d{2}|20\d{2})\b', '', title)
        base_name = re.sub(r'\s+', ' ', base_name).strip()
        base_name = re.sub(r'edition|version', '', base_name, flags=re.IGNORECASE).strip()

        if base_name not in temporal_groups:
            temporal_groups[base_name] = []

        temporal_groups[base_name].append(dataset)

    # Keep only true series (two or more datasets), sorted by year
    temporal_groups = {
        k: sorted(v, key=lambda x: x["year"])
        for k, v in temporal_groups.items()
        if len(v) > 1
    }

    return temporal_groups
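
# Illustrative grouping (titles hypothetical): "Forest Cover 2012" and
# "Forest Cover 2019" both reduce to the base name "Forest Cover", so they
# form one group sorted by year ([2012, 2019]); singletons are dropped.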


def save_catalog(datasets: List[Dict], temporal_groups: Dict[str, List[Dict]]):
    """Save catalog and temporal groups to JSON files"""
    METADATA_DIR.mkdir(parents=True, exist_ok=True)

    catalog_path = METADATA_DIR / "stri_catalog.json"
    with open(catalog_path, 'w') as f:
        json.dump({
            "generated_at": datetime.now().isoformat(),
            "total_datasets": len(datasets),
            "datasets": datasets
        }, f, indent=2)

    logger.info(f"Saved catalog to {catalog_path}")

    if temporal_groups:
        temporal_path = METADATA_DIR / "stri_temporal_groups.json"
        with open(temporal_path, 'w') as f:
            json.dump({
                "generated_at": datetime.now().isoformat(),
                "num_groups": len(temporal_groups),
                "groups": temporal_groups
            }, f, indent=2)

        logger.info(f"Saved {len(temporal_groups)} temporal groups to {temporal_path}")


def main():
    """Main execution"""
    logger.info("=== STRI GIS Portal Catalog Scraper ===")

    datasets = catalog_datasets(max_datasets=100)

    temporal_groups = identify_temporal_groups(datasets)

    save_catalog(datasets, temporal_groups)

    logger.info("\n" + "=" * 60)
    logger.info(f"✅ Cataloged {len(datasets)} datasets")
    logger.info(f"📊 Found {len(temporal_groups)} temporal dataset groups")

    if temporal_groups:
        logger.info("\nTemporal Groups:")
        for base_name, group in list(temporal_groups.items())[:5]:
            years = [d["year"] for d in group]
            logger.info(f"  - {base_name}: {years}")

    logger.info("\nTop 10 Priority Datasets:")
    for i, dataset in enumerate(datasets[:10], 1):
        logger.info(f"  {i}. [{dataset['priority_score']:.1f}] {dataset['title']}")

    logger.info("=" * 60)


if __name__ == "__main__":
    main()