File size: 4,529 Bytes
4851501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
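"""
Extract additional feature layers from existing Overture Maps places data.

Each layer is selected by category keyword and written out as GeoJSON:
- Healthcare: hospitals, clinics, pharmacies
- Tourist attractions
- Accommodation: hotels and similar lodging
- Restaurants, cafes and bars

NOTE: government offices are not extracted by this script yet.
"""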

import geopandas as gpd
from pathlib import Path
import logging

# Configure root logging at INFO so the progress messages emitted by the
# extract_* functions and main() are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# All paths hang off the "data" directory that sits next to this script's
# parent directory (i.e. <dir containing this file>/../data).
DATA_DIR = Path(__file__).parent.parent / "data"
# Input: the extract_* functions read "places.geojson" from here.
OVERTURE_DIR = DATA_DIR / "overture"
# Output: the filtered GeoJSON layers are written here.
OUTPUT_DIR = DATA_DIR / "enriched"

def extract_healthcare():
    """Filter the Overture places layer down to healthcare facilities.

    Reads ``places.geojson`` from ``OVERTURE_DIR``, keeps the rows whose
    ``category`` value contains any healthcare keyword (case-insensitive
    substring match; rows with a missing category are excluded), and writes
    them to ``healthcare_facilities.geojson`` under ``OUTPUT_DIR``, creating
    that directory when it does not exist.

    Returns:
        A ``(output_path, feature_count)`` tuple.
    """
    logger.info("Extracting healthcare facilities...")

    all_places = gpd.read_file(OVERTURE_DIR / "places.geojson")

    # The keywords are joined into one regex alternation so a single
    # vectorised pass over the column tests all of them.
    keywords = ['hospital', 'clinic', 'pharmacy', 'doctor', 'dentist', 'health']
    pattern = '|'.join(keywords)
    is_healthcare = all_places['category'].str.contains(pattern, case=False, na=False)
    facilities = all_places[is_healthcare]

    facility_count = len(facilities)
    logger.info(f"Found {facility_count} healthcare facilities")

    # Make sure the destination directory exists before writing.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    destination = OUTPUT_DIR / "healthcare_facilities.geojson"
    facilities.to_file(destination, driver='GeoJSON')

    return destination, facility_count

def extract_tourism():
    """Extract tourist attractions from Overture places.

    Reads ``places.geojson`` from ``OVERTURE_DIR``, keeps the rows whose
    ``category`` value contains any tourism keyword (case-insensitive
    substring match; rows with a missing category are excluded), and writes
    them to ``tourist_attractions.geojson`` under ``OUTPUT_DIR``.

    Returns:
        A ``(output_path, feature_count)`` tuple.
    """
    logger.info("Extracting tourist attractions...")

    places_path = OVERTURE_DIR / "places.geojson"
    gdf = gpd.read_file(places_path)

    # Filter for tourism.
    # NOTE(review): this is a substring match, so 'park' also matches
    # categories such as 'parking' -- confirm that is intended.
    tourism_categories = ['museum', 'monument', 'attraction', 'park', 'beach', 'viewpoint', 'zoo', 'aquarium']
    tourism_gdf = gdf[gdf['category'].str.contains('|'.join(tourism_categories), case=False, na=False)]

    logger.info(f"Found {len(tourism_gdf)} tourist attractions")

    # Save. Create the output directory here instead of relying on
    # extract_healthcare() having run (and succeeded) first; otherwise the
    # write fails when this function is called on its own.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "tourist_attractions.geojson"
    tourism_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(tourism_gdf)

def extract_accommodation():
    """Extract hotels and other accommodation from Overture places.

    Reads ``places.geojson`` from ``OVERTURE_DIR``, keeps the rows whose
    ``category`` value contains any accommodation keyword (case-insensitive
    substring match; rows with a missing category are excluded), and writes
    them to ``accommodation.geojson`` under ``OUTPUT_DIR``.

    Returns:
        A ``(output_path, feature_count)`` tuple.
    """
    logger.info("Extracting accommodation...")

    places_path = OVERTURE_DIR / "places.geojson"
    gdf = gpd.read_file(places_path)

    # Filter for accommodation (substring match against the category text).
    accommodation_categories = ['hotel', 'hostel', 'motel', 'resort', 'lodge', 'guest_house']
    accommodation_gdf = gdf[gdf['category'].str.contains('|'.join(accommodation_categories), case=False, na=False)]

    logger.info(f"Found {len(accommodation_gdf)} accommodation facilities")

    # Save. Create the output directory here instead of relying on
    # extract_healthcare() having run (and succeeded) first; otherwise the
    # write fails when this function is called on its own.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "accommodation.geojson"
    accommodation_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(accommodation_gdf)

def extract_restaurants():
    """Extract restaurants and other food services from Overture places.

    Reads ``places.geojson`` from ``OVERTURE_DIR``, keeps the rows whose
    ``category`` value contains any food-service keyword (case-insensitive
    substring match; rows with a missing category are excluded), and writes
    them to ``restaurants.geojson`` under ``OUTPUT_DIR``.

    Returns:
        A ``(output_path, feature_count)`` tuple.
    """
    logger.info("Extracting restaurants...")

    places_path = OVERTURE_DIR / "places.geojson"
    gdf = gpd.read_file(places_path)

    # Filter for restaurants.
    # NOTE(review): this is a substring match, so 'bar' also matches
    # categories such as 'barber' -- confirm that is intended.
    restaurant_categories = ['restaurant', 'cafe', 'bar', 'fast_food', 'food_court']
    restaurant_gdf = gdf[gdf['category'].str.contains('|'.join(restaurant_categories), case=False, na=False)]

    logger.info(f"Found {len(restaurant_gdf)} restaurants/cafes")

    # Save. Create the output directory here instead of relying on
    # extract_healthcare() having run (and succeeded) first; otherwise the
    # write fails when this function is called on its own.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    output_path = OUTPUT_DIR / "restaurants.geojson"
    restaurant_gdf.to_file(output_path, driver='GeoJSON')

    return output_path, len(restaurant_gdf)

def main():
    """Run every extraction and log a per-dataset summary.

    Each extractor runs independently: a failure in one is logged (with its
    traceback) and does not stop the remaining extractions.

    Returns:
        A list of ``{"dataset": name, "count": n}`` dicts, one per extraction
        that succeeded, in the order the extractions were run.
    """
    logger.info("=== Extracting features from Overture data ===")

    # (label used in the failure message, dataset name reported, extractor)
    extractions = (
        ("healthcare", "healthcare_facilities", extract_healthcare),
        ("tourism", "tourist_attractions", extract_tourism),
        ("accommodation", "accommodation", extract_accommodation),
        ("restaurant", "restaurants", extract_restaurants),
    )

    results = []
    for label, dataset, extractor in extractions:
        try:
            # The output path is not needed here; only the count is reported.
            _, count = extractor()
        except Exception as e:
            # Best-effort: keep going with the other datasets, but record the
            # traceback so the failure can actually be diagnosed.
            logger.exception("Failed %s extraction: %s", label, e)
        else:
            results.append({"dataset": dataset, "count": count})

    logger.info("\n=== Extraction Summary ===")
    for result in results:
        logger.info(f"  {result['dataset']}: {result['count']} features")

    return results

# Run the extractions only when executed as a script, not when imported.
if __name__ == "__main__":
    main()