Spaces:
Sleeping
Sleeping
File size: 3,428 Bytes
acd8e16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
"""
NYC Taxi Small Dataset Loader
Creates a DuckDB database with sample taxi trip data for testing.
"""
import duckdb
import os
from datetime import datetime, timedelta
def create_database(db_path: str = "nyc_taxi_small.duckdb"):
"""Create a DuckDB database with sample taxi data."""
# Remove existing database if it exists
if os.path.exists(db_path):
os.remove(db_path)
# Connect to DuckDB
conn = duckdb.connect(db_path)
# Read and execute schema
schema_path = os.path.join(os.path.dirname(__file__), "schema.sql")
with open(schema_path, 'r') as f:
schema_sql = f.read()
conn.execute(schema_sql)
# Insert sample data
base_time = datetime(2023, 1, 1, 8, 0, 0)
# Sample trips data
trips_data = [
(1, base_time, base_time + timedelta(minutes=15), 1, 2.5, -73.9857, 40.7484, -73.9881, 40.7614, 12.50, 2.50, 15.00, "Credit", "CMT"),
(2, base_time + timedelta(minutes=30), base_time + timedelta(minutes=45), 2, 1.8, -73.9857, 40.7484, -73.9881, 40.7614, 8.50, 1.70, 10.20, "Cash", "VTS"),
(3, base_time + timedelta(hours=1), base_time + timedelta(hours=1, minutes=20), 1, 4.2, -73.9857, 40.7484, -73.9881, 40.7614, 18.00, 3.60, 21.60, "Credit", "CMT"),
(4, base_time + timedelta(hours=2), base_time + timedelta(hours=2, minutes=10), 3, 0.9, -73.9857, 40.7484, -73.9881, 40.7614, 6.00, 1.20, 7.20, "Credit", "VTS"),
(5, base_time + timedelta(hours=3), base_time + timedelta(hours=3, minutes=25), 1, 3.1, -73.9857, 40.7484, -73.9881, 40.7614, 14.50, 2.90, 17.40, "Cash", "CMT"),
(6, base_time + timedelta(hours=4), base_time + timedelta(hours=4, minutes=12), 2, 2.3, -73.9857, 40.7484, -73.9881, 40.7614, 11.00, 2.20, 13.20, "Credit", "VTS"),
(7, base_time + timedelta(hours=5), base_time + timedelta(hours=5, minutes=18), 1, 1.5, -73.9857, 40.7484, -73.9881, 40.7614, 7.50, 1.50, 9.00, "Credit", "CMT"),
(8, base_time + timedelta(hours=6), base_time + timedelta(hours=6, minutes=22), 4, 5.8, -73.9857, 40.7484, -73.9881, 40.7614, 25.00, 5.00, 30.00, "Credit", "VTS"),
(9, base_time + timedelta(hours=7), base_time + timedelta(hours=7, minutes=8), 1, 0.7, -73.9857, 40.7484, -73.9881, 40.7614, 5.50, 1.10, 6.60, "Cash", "CMT"),
(10, base_time + timedelta(hours=8), base_time + timedelta(hours=8, minutes=35), 2, 6.2, -73.9857, 40.7484, -73.9881, 40.7614, 28.00, 5.60, 33.60, "Credit", "VTS"),
]
# Sample zones data
zones_data = [
(1, "Manhattan", "Central Park", "Yellow Zone"),
(2, "Manhattan", "Times Square", "Yellow Zone"),
(3, "Brooklyn", "Williamsburg", "Boro Zone"),
(4, "Queens", "Astoria", "Boro Zone"),
(5, "Bronx", "Yankee Stadium", "Boro Zone"),
(6, "Staten Island", "St. George", "Boro Zone"),
]
# Insert trips data
conn.executemany(
"INSERT INTO trips VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
trips_data
)
# Insert zones data
conn.executemany(
"INSERT INTO zones VALUES (?, ?, ?, ?)",
zones_data
)
conn.close()
print(f"Created database: {db_path}")
return db_path
def load_data(db_path: str = "nyc_taxi_small.duckdb"):
"""Load data into the database - wrapper for create_database."""
return create_database(db_path)
if __name__ == "__main__":
create_database()
|