File size: 3,428 Bytes
acd8e16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
NYC Taxi Small Dataset Loader
Creates a DuckDB database with sample taxi trip data for testing.
"""

import duckdb
import os
from datetime import datetime, timedelta


def create_database(db_path: str = "nyc_taxi_small.duckdb"):
    """Create a DuckDB database with sample taxi data."""
    
    # Remove existing database if it exists
    if os.path.exists(db_path):
        os.remove(db_path)
    
    # Connect to DuckDB
    conn = duckdb.connect(db_path)
    
    # Read and execute schema
    schema_path = os.path.join(os.path.dirname(__file__), "schema.sql")
    with open(schema_path, 'r') as f:
        schema_sql = f.read()
    
    conn.execute(schema_sql)
    
    # Insert sample data
    base_time = datetime(2023, 1, 1, 8, 0, 0)
    
    # Sample trips data
    trips_data = [
        (1, base_time, base_time + timedelta(minutes=15), 1, 2.5, -73.9857, 40.7484, -73.9881, 40.7614, 12.50, 2.50, 15.00, "Credit", "CMT"),
        (2, base_time + timedelta(minutes=30), base_time + timedelta(minutes=45), 2, 1.8, -73.9857, 40.7484, -73.9881, 40.7614, 8.50, 1.70, 10.20, "Cash", "VTS"),
        (3, base_time + timedelta(hours=1), base_time + timedelta(hours=1, minutes=20), 1, 4.2, -73.9857, 40.7484, -73.9881, 40.7614, 18.00, 3.60, 21.60, "Credit", "CMT"),
        (4, base_time + timedelta(hours=2), base_time + timedelta(hours=2, minutes=10), 3, 0.9, -73.9857, 40.7484, -73.9881, 40.7614, 6.00, 1.20, 7.20, "Credit", "VTS"),
        (5, base_time + timedelta(hours=3), base_time + timedelta(hours=3, minutes=25), 1, 3.1, -73.9857, 40.7484, -73.9881, 40.7614, 14.50, 2.90, 17.40, "Cash", "CMT"),
        (6, base_time + timedelta(hours=4), base_time + timedelta(hours=4, minutes=12), 2, 2.3, -73.9857, 40.7484, -73.9881, 40.7614, 11.00, 2.20, 13.20, "Credit", "VTS"),
        (7, base_time + timedelta(hours=5), base_time + timedelta(hours=5, minutes=18), 1, 1.5, -73.9857, 40.7484, -73.9881, 40.7614, 7.50, 1.50, 9.00, "Credit", "CMT"),
        (8, base_time + timedelta(hours=6), base_time + timedelta(hours=6, minutes=22), 4, 5.8, -73.9857, 40.7484, -73.9881, 40.7614, 25.00, 5.00, 30.00, "Credit", "VTS"),
        (9, base_time + timedelta(hours=7), base_time + timedelta(hours=7, minutes=8), 1, 0.7, -73.9857, 40.7484, -73.9881, 40.7614, 5.50, 1.10, 6.60, "Cash", "CMT"),
        (10, base_time + timedelta(hours=8), base_time + timedelta(hours=8, minutes=35), 2, 6.2, -73.9857, 40.7484, -73.9881, 40.7614, 28.00, 5.60, 33.60, "Credit", "VTS"),
    ]
    
    # Sample zones data
    zones_data = [
        (1, "Manhattan", "Central Park", "Yellow Zone"),
        (2, "Manhattan", "Times Square", "Yellow Zone"),
        (3, "Brooklyn", "Williamsburg", "Boro Zone"),
        (4, "Queens", "Astoria", "Boro Zone"),
        (5, "Bronx", "Yankee Stadium", "Boro Zone"),
        (6, "Staten Island", "St. George", "Boro Zone"),
    ]
    
    # Insert trips data
    conn.executemany(
        "INSERT INTO trips VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        trips_data
    )
    
    # Insert zones data
    conn.executemany(
        "INSERT INTO zones VALUES (?, ?, ?, ?)",
        zones_data
    )
    
    conn.close()
    print(f"Created database: {db_path}")
    return db_path


def load_data(db_path: str = "nyc_taxi_small.duckdb"):
    """Load data into the database - wrapper for create_database."""
    return create_database(db_path)


if __name__ == "__main__":
    create_database()