-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathconvert_csv.py
More file actions
163 lines (134 loc) · 5.77 KB
/
convert_csv.py
File metadata and controls
163 lines (134 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import csv
import json
import urllib.request
import os
import struct
# NYC TLC Yellow Taxi Trip Data
# Local filename the downloaded parquet file is stored under; an existing
# file with this name is reused instead of downloading again.
PARQUET_FILE = "yellow_tripdata_2023-01.parquet"
# Remote URL the parquet file is fetched from when no local copy exists.
DATASET_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
def read_parquet_simple(filepath, target_points=200_000):
    """Load a parquet file into a pandas DataFrame, or return None on failure.

    pyarrow is tried first. When pyarrow (or pandas) cannot be imported, the
    function falls back to ``pandas.read_parquet``. Despite the name, no
    hand-rolled parquet parsing is performed.

    Args:
        filepath: Path of the parquet file to read.
        target_points: Unused. Accepted only so existing callers keep working.

    Returns:
        The file contents as a DataFrame, or None when no parquet reader is
        available or the file cannot be read. The failure reason is printed.
    """
    try:
        import pyarrow.parquet as pq
        import pandas  # noqa: F401 - Table.to_pandas() below requires pandas
    except ImportError:
        pq = None

    if pq is not None:
        print("Reading parquet file with pyarrow...")
        try:
            return pq.read_table(filepath).to_pandas()
        except Exception as exc:
            # Callers treat None as "could not read" and fall back to other
            # data sources, so a missing or corrupt file must not raise here.
            print(f"Could not read {filepath} with pyarrow: {exc}")
            return None

    print("pyarrow not available, trying pandas directly...")
    try:
        import pandas as pd
        return pd.read_parquet(filepath)
    except Exception as exc:
        # Covers a missing pandas, a missing parquet engine and unreadable
        # files, without swallowing KeyboardInterrupt/SystemExit the way a
        # bare ``except:`` would.
        print(f"Could not read {filepath} with pandas: {exc}")
        return None
def _extract_pickup_points(df, limit, bounds):
    """Return up to ``limit`` in-bounds pickup points from ``df`` as a flat list.

    Each point contributes four floats: ``[lat, lon, point_id, 1.0]``.
    Returns an empty list when ``df`` has no pickup longitude/latitude columns.
    """
    min_lon, max_lon, min_lat, max_lat = bounds

    # Find the coordinate columns (they changed format over time)
    lon_col = lat_col = None
    for col in df.columns:
        col_lower = str(col).lower()
        if 'pickup' in col_lower and 'lon' in col_lower:
            lon_col = col
        if 'pickup' in col_lower and 'lat' in col_lower:
            lat_col = col
    if lon_col is None or lat_col is None:
        return []

    print(f"Using columns: {lon_col}, {lat_col}")
    points = []
    count = 0
    # Iterate the two columns positionally: cheaper than DataFrame.iterrows()
    # and independent of whatever index labels the frame carries.
    for row_num, (raw_lon, raw_lat) in enumerate(zip(df[lon_col], df[lat_col])):
        if count >= limit:
            break
        try:
            lon = float(raw_lon)
            lat = float(raw_lat)
        except (ValueError, TypeError):
            continue
        # Filter valid NYC coordinates (NaN fails both comparisons)
        if min_lon <= lon <= max_lon and min_lat <= lat <= max_lat:
            points.extend([lat, lon, float(count), 1.0])
            count += 1
        if row_num % 100000 == 0:
            print(f"Processed {row_num} rows, got {count} valid points...")
    return points


def _synthetic_nyc_points(count, start_id, bounds):
    """Return exactly ``count`` synthetic in-bounds points as a flat list.

    Points are drawn around weighted hotspots with a gaussian spread and are
    numbered consecutively from ``start_id``. Layout per point is
    ``[lat, lon, point_id, 1.0]``.
    """
    import random

    min_lon, max_lon, min_lat, max_lat = bounds
    random.seed(42)  # fixed seed keeps the generated file reproducible

    # (lon, lat, weight) hotspots; the weights sum to 1.0
    hotspots = [
        (-73.9857, 40.7484, 0.3),   # Times Square area
        (-73.9712, 40.7831, 0.15),  # Upper East Side
        (-74.0060, 40.7128, 0.2),   # Financial District
        (-73.9851, 40.7589, 0.15),  # Midtown
        (-73.9632, 40.7794, 0.1),   # Central Park area
        (-73.9442, 40.6782, 0.05),  # Brooklyn
        (-73.8700, 40.7769, 0.05),  # Queens/LGA
    ]

    points = []
    next_id = start_id
    end_id = start_id + count
    # Loop on accepted points rather than attempts, so out-of-bounds samples
    # are redrawn and the caller gets exactly the number it asked for.
    while next_id < end_id:
        # Pick a hotspot based on weight; default to the first one in case
        # float rounding leaves the cumulative sum marginally below r.
        r = random.random()
        cumulative = 0
        base_lon, base_lat, _ = hotspots[0]
        for lon, lat, weight in hotspots:
            cumulative += weight
            if r <= cumulative:
                base_lon, base_lat = lon, lat
                break
        # Add gaussian spread around hotspot
        lat = base_lat + random.gauss(0, 0.015)
        lon = base_lon + random.gauss(0, 0.015)
        # Ensure within NYC bounds
        if min_lon <= lon <= max_lon and min_lat <= lat <= max_lat:
            points.extend([lat, lon, float(next_id), 1.0])
            next_id += 1
    return points


def download_and_convert():
    """Download NYC taxi data and write 200k points to assets/taxi-data.bin.

    Steps:
      1. Download the parquet file unless a local copy already exists.
      2. Extract in-bounds pickup coordinates when the file can be read and
         has pickup longitude/latitude columns.
      3. Top up with synthetic points so the output holds exactly the target
         number of points.
      4. Write the points as little-endian float32 values, four per point
         (``lat, lon, id, type``), creating the output directory if needed.

    Raises:
        Errors from the download (``urllib.request.urlretrieve``) and from
        writing the output file are propagated to the caller.
    """
    target_points = 200_000
    output_path = 'assets/taxi-data.bin'

    # Check if we need to download
    if not os.path.exists(PARQUET_FILE):
        print(f"Downloading dataset from {DATASET_URL}...")
        # Download to a temporary name and rename on success, so an
        # interrupted download is never mistaken for a complete file later.
        partial_path = PARQUET_FILE + ".part"
        urllib.request.urlretrieve(DATASET_URL, partial_path)
        os.replace(partial_path, PARQUET_FILE)
        print("Download complete!")
    else:
        print(f"Using existing {PARQUET_FILE}")

    df = read_parquet_simple(PARQUET_FILE, target_points)
    if df is None:
        print("Could not read parquet. Generating synthetic data instead...")

    # NYC bounding box for filtering: (min_lon, max_lon, min_lat, max_lat)
    bounds = (-74.3, -73.7, 40.5, 40.95)

    data = []
    if df is not None:
        print(f"Dataset has {len(df)} rows, columns: {list(df.columns)}")
        data = _extract_pickup_points(df, target_points, bounds)

    # If we didn't get enough points from the dataset, generate synthetic data
    point_count = len(data) // 4
    missing = target_points - point_count
    if missing > 0:
        print(f"Generating {missing} synthetic NYC taxi points...")
        data.extend(_synthetic_nyc_points(missing, point_count, bounds))

    # Save as binary file (Float32Array compatible: 4 floats per point).
    # '<' pins little-endian output regardless of the host's byte order.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'wb') as f:
        f.write(struct.pack(f'<{len(data)}f', *data))

    file_size = os.path.getsize(output_path) / (1024 * 1024)
    print(f"Converted {len(data)//4} points to {output_path} ({file_size:.1f} MB)")
def convert_csv():
    """Fallback: convert the local ``taxi-sample.csv`` to assets/taxi-data.bin.

    The first CSV row is skipped as a header. Longitude is read from column
    index 5 and latitude from column index 6. Rows are skipped when they are
    too short, hold non-numeric coordinates, or have a zero coordinate.

    Each kept row contributes four little-endian float32 values:
    ``lat, lon, id, 1.0``. The id is the zero-based data-row number in the
    CSV, so ids are not consecutive when rows are skipped. The ``assets``
    directory is created if missing.

    Raises:
        OSError: If ``taxi-sample.csv`` cannot be opened or the output file
            cannot be written.
    """
    data = []
    # newline='' is what the csv module requires for correct handling of
    # quoted fields that contain line breaks.
    with open('taxi-sample.csv', 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; the default tolerates an empty file
        for i, row in enumerate(reader):
            try:
                lon = float(row[5])
                lat = float(row[6])
            except (ValueError, IndexError):
                # Non-numeric value, or a blank/short row without both columns
                continue
            # Filter out invalid coordinates (0,0 is common in this dataset for errors)
            if lon == 0 or lat == 0:
                continue
            data.extend([lat, lon, float(i), 1.0])

    os.makedirs('assets', exist_ok=True)
    with open('assets/taxi-data.bin', 'wb') as f:
        # '<' pins little-endian output regardless of the host's byte order
        f.write(struct.pack(f'<{len(data)}f', *data))
    print(f"converted {len(data)//4} points to assets/taxi-data.bin")
# Script entry point: run the download-and-convert pipeline only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    download_and_convert()