-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_trip_dataset.py
73 lines (64 loc) · 2.65 KB
/
prepare_trip_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import glob
import os
import sys
import argparse
import pandas as pd
from utils import checkCSV, load_Topology, get_max_bounds_topology
# The only columns that end up in the output. Reading just these is equivalent
# to loading every raw column and dropping the rest, but needs far less memory.
_TRIP_COLUMNS = ['pickup_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude']


def filter_pickups_in_bounds(df, min_lat, max_lat, min_lon, max_lon):
    """Return the rows of *df* picked up inside the bounding box, sorted by time.

    Args:
        df: DataFrame with at least the columns ``pickup_longitude``,
            ``pickup_latitude`` and ``pickup_datetime``.
        min_lat, max_lat: Inclusive latitude limits.
        min_lon, max_lon: Inclusive longitude limits.

    Returns:
        A new DataFrame (the input is not modified) holding only the rows
        whose pickup coordinates fall inside the limits, ordered by
        ``pickup_datetime``. The original index labels are kept.
    """
    in_lon = df['pickup_longitude'].between(min_lon, max_lon)
    in_lat = df['pickup_latitude'].between(min_lat, max_lat)
    # Non-inplace sort: sorting a filtered slice in place can trigger pandas'
    # SettingWithCopyWarning.
    return df[in_lon & in_lat].sort_values("pickup_datetime")


def main():
    """Build ``data/trips/<name>/trips.csv`` from a folder of raw trip CSV files.

    Every file in ``--path`` is reduced to the pickup columns, restricted to
    the bounding box of the topology given by ``--topology`` and sorted by
    pickup time; the per-file results are then concatenated into a single
    ``trips.csv``. Nothing is done if that file already exists.

    Exits with a non-zero status when a required argument is missing or when
    the input folder is empty.
    """
    parser = argparse.ArgumentParser(description="Prepare Trips dataset")
    parser.add_argument('--path', help='Path to trip csv files')
    parser.add_argument('--topology', help='Path to topology csv file')
    parser.add_argument('--name', help='naming of the output files folder')
    args = parser.parse_args()

    # Validate every argument before doing any work. sys.exit(<str>) writes
    # the message to stderr and exits with status 1, so callers and shell
    # scripts can detect the failure (a bare sys.exit() reports success).
    for option in ('name', 'topology', 'path'):
        if getattr(args, option) is None:
            sys.exit("Please set --" + option)

    checkCSV(args.topology)
    df_topology = load_Topology(args.topology)
    # get [max_lat, min_lat, max_lon, min_lon]
    max_bounds = get_max_bounds_topology(df_topology)
    max_lat = max_bounds[0]
    min_lat = max_bounds[1]
    max_lon = max_bounds[2]
    min_lon = max_bounds[3]

    out_dir = 'data/trips/' + args.name
    merged_path = os.path.join(out_dir, "trips.csv")
    if os.path.isfile(merged_path):
        # A previous run already produced the merged file.
        return

    # Sorted so the row order of the merged file does not depend on the
    # order in which the file system happens to list the inputs.
    sources = sorted(os.listdir(args.path))
    if not sources:
        sys.exit("No trip files found in " + args.path)
    os.makedirs(out_dir, exist_ok=True)

    intermediates = []
    for filename in sources:
        src = os.path.join(args.path, filename)
        checkCSV(src)
        print("load trip dataset...", filename)
        df = pd.read_csv(src, skipinitialspace=True, usecols=_TRIP_COLUMNS, low_memory=False)
        print("get trips in topology bounds...")
        df = filter_pickups_in_bounds(df, min_lat, max_lat, min_lon, max_lon)
        part = os.path.join(out_dir, filename)
        df.to_csv(part)
        intermediates.append(part)

    # Merge exactly the files written above, not whatever else matches
    # "*.csv" in the output folder.
    # NOTE(review): each part is written with its index and read back without
    # index_col, so trips.csv carries an extra "Unnamed: 0" column plus a
    # non-unique index column. Kept as-is because downstream readers may
    # depend on that layout - confirm before switching to index=False.
    # NOTE(review): rows are time-sorted within each input file only; the
    # merged file is not globally sorted by pickup_datetime.
    merged = pd.concat(map(pd.read_csv, intermediates))
    merged.to_csv(merged_path)

    # Remove only the intermediates this run created. An input that is itself
    # called "trips.csv" shares its path with the merged output and must stay.
    for part in intermediates:
        if part != merged_path:
            os.remove(part)


if __name__ == '__main__':
    main()