I'm Jaya Prakash, and this project is part of my journey in mastering data manipulation and machine learning.
The goal of this project is to develop a regression model that accurately predicts the fare of Uber rides. The data used in this project comes from Kaggle: https://www.kaggle.com/datasets/yasserh/uber-fares-dataset
# Import all the necessary tools for EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("uber.csv")
data.head()
|   | Unnamed: 0 | key | fare_amount | pickup_datetime | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 24238194 | 2015-05-07 19:52:06.0000003 | 7.5 | 2015-05-07 19:52:06 UTC | -73.999817 | 40.738354 | -73.999512 | 40.723217 | 1 |
| 1 | 27835199 | 2009-07-17 20:04:56.0000002 | 7.7 | 2009-07-17 20:04:56 UTC | -73.994355 | 40.728225 | -73.994710 | 40.750325 | 1 |
| 2 | 44984355 | 2009-08-24 21:45:00.00000061 | 12.9 | 2009-08-24 21:45:00 UTC | -74.005043 | 40.740770 | -73.962565 | 40.772647 | 1 |
| 3 | 25894730 | 2009-06-26 08:22:21.0000001 | 5.3 | 2009-06-26 08:22:21 UTC | -73.976124 | 40.790844 | -73.965316 | 40.803349 | 3 |
| 4 | 17610152 | 2014-08-28 17:47:00.000000188 | 16.0 | 2014-08-28 17:47:00 UTC | -73.925023 | 40.744085 | -73.973082 | 40.761247 | 5 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   Unnamed: 0         200000 non-null  int64
 1   key                200000 non-null  object
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB
data.isna().sum()
Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64
for column in data.columns:
    print(data[column].value_counts())
24238194    1
23286231    1
45197665    1
30631497    1
7869264     1
..
53467014    1
15557161    1
11971041    1
6135974     1
11951496    1
Name: Unnamed: 0, Length: 200000, dtype: int64

2015-05-07 19:52:06.0000003      1
2012-10-14 22:58:00.00000051     1
2013-09-06 10:59:00.00000086     1
2013-12-27 20:23:50.0000001      1
2010-07-22 18:55:00.000000151    1
..
2010-06-28 11:17:41.0000005      1
2010-12-01 12:58:32.0000001      1
2013-05-12 21:10:21.0000003      1
2014-08-09 16:03:54.0000002      1
2010-05-15 04:08:00.00000076     1
Name: key, Length: 200000, dtype: int64

6.50      9684
4.50      8247
8.50      7521
5.70      5858
5.30      5838
...
140.25       1
190.00       1
45.16        1
28.20        1
89.10        1
Name: fare_amount, Length: 1244, dtype: int64

2014-04-13 18:19:00 UTC    4
2010-03-14 12:00:00 UTC    4
2009-02-12 12:46:00 UTC    4
2011-02-18 18:55:00 UTC    3
2009-03-12 17:12:00 UTC    3
..
2013-03-08 07:16:00 UTC    1
2013-05-17 21:33:31 UTC    1
2009-10-24 04:05:00 UTC    1
2013-05-16 16:12:00 UTC    1
2010-05-15 04:08:00 UTC    1
Name: pickup_datetime, Length: 196629, dtype: int64

 0.000000     3786
-73.137393      72
-73.982600      20
-73.987167      20
-73.982210      20
...
-73.878872       1
-73.925593       1
-73.976158       1
-73.789910       1
-73.997124       1
Name: pickup_longitude, Length: 71066, dtype: int64

0.000000     3782
41.366138      72
40.774100      20
40.755900      20
40.774000      20
...
40.645305       1
40.741753       1
40.747328       1
40.708407       1
40.725452       1
Name: pickup_latitude, Length: 83835, dtype: int64

 0.000000     3764
-73.137393      65
-73.980400      21
-73.991025      21
-73.978952      20
...
-73.967661       1
-74.000016       1
-73.884682       1
-73.984076       1
-73.858957       1
Name: dropoff_longitude, Length: 76894, dtype: int64

0.000000     3758
41.366138      65
40.750207      21
40.756400      17
40.750322      16
...
40.820070       1
40.656993       1
40.776530       1
40.783642       1
40.768793       1
Name: dropoff_latitude, Length: 90585, dtype: int64

1      138425
2       29428
5       14009
3        8881
4        4276
6        4271
0         709
208         1
Name: passenger_count, dtype: int64
data.fare_amount.sort_values()
111589    -52.00
98875     -52.00
164056    -50.50
89322     -49.57
92063     -23.70
...
197493    230.00
71715     250.00
185325    275.00
4292      350.00
170081    499.00
Name: fare_amount, Length: 200000, dtype: float64
data.pickup_longitude.sort_values()
75851    -1340.648410
144253    -768.550000
4949      -748.016667
199936    -736.400000
103745    -736.216667
...
68551       40.801212
184570      40.803672
185317      40.806012
51546       40.808425
91422       57.418457
Name: pickup_longitude, Length: 200000, dtype: float64
To clean the data, drop rows with coordinates outside the valid longitude (±180°) and latitude (±90°) ranges, passenger counts of 0 or more than 6, non-positive fares or fares above $200, and rows with missing values.
(This still leaves a few outliers that are probably errors in data collection, but given the volume of the data, let us ignore them.)
# Drop rows with out-of-range coordinates
for column in ["pickup_longitude", "dropoff_longitude"]:
    data.drop(data.loc[(data[column] < -180) | (data[column] > 180)].index, inplace=True)
for column in ["pickup_latitude", "dropoff_latitude"]:
    data.drop(data.loc[(data[column] < -90) | (data[column] > 90)].index, inplace=True)
# Drop rows with impossible passenger counts and implausible fares
data.drop(data.loc[(data["passenger_count"] > 6) | (data["passenger_count"] == 0)].index, inplace=True)
data.drop(data.loc[(data["fare_amount"] <= 0) | (data["fare_amount"] > 200)].index, inplace=True)
# Drop the rows with missing dropoff coordinates and rebuild the index
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
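As an aside, the same row-dropping rules can be collapsed into a single boolean mask, which keeps every rule visible in one place and touches the DataFrame only once. A minimal sketch (the `clean` helper name is my own):

```python
import pandas as pd

# Sketch: the cleaning rules above expressed as one boolean mask.
def clean(df: pd.DataFrame) -> pd.DataFrame:
    mask = (
        df["pickup_longitude"].between(-180, 180)
        & df["dropoff_longitude"].between(-180, 180)
        & df["pickup_latitude"].between(-90, 90)
        & df["dropoff_latitude"].between(-90, 90)
        & df["passenger_count"].between(1, 6)   # drops 0 and the 208-passenger row
        & (df["fare_amount"] > 0)
        & (df["fare_amount"] <= 200)
    )
    return df[mask].dropna().reset_index(drop=True)
```

`Series.between` is inclusive on both ends, so boundary values such as a longitude of exactly 180 survive, matching the strict `<`/`>` comparisons above.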
Rename the unnamed column to "id" and drop the "key" column, as it is redundant: the pickup datetime is already provided, and the "id" column uniquely identifies each row.
data.rename(columns = {"Unnamed: 0":"id"}, inplace=True)
data.drop(["key"], axis=1, inplace=True)
data.head()
|   | id | fare_amount | pickup_datetime | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count |
|---|---|---|---|---|---|---|---|---|
| 0 | 24238194 | 7.5 | 2015-05-07 19:52:06 UTC | -73.999817 | 40.738354 | -73.999512 | 40.723217 | 1 |
| 1 | 27835199 | 7.7 | 2009-07-17 20:04:56 UTC | -73.994355 | 40.728225 | -73.994710 | 40.750325 | 1 |
| 2 | 44984355 | 12.9 | 2009-08-24 21:45:00 UTC | -74.005043 | 40.740770 | -73.962565 | 40.772647 | 1 |
| 3 | 25894730 | 5.3 | 2009-06-26 08:22:21 UTC | -73.976124 | 40.790844 | -73.965316 | 40.803349 | 3 |
| 4 | 17610152 | 16.0 | 2014-08-28 17:47:00 UTC | -73.925023 | 40.744085 | -73.973082 | 40.761247 | 5 |
Split pickup_datetime into separate year, month, day, and time columns (converting the time into an integer number of seconds since midnight).
Also derive a distance column from the longitude and latitude columns.
def pre_process(data):
    new_data = data.copy()
    year_list, month_list, day_list, time_list, distance_list = [], [], [], [], []
    for i in range(len(new_data)):
        # e.g. "2015-05-07 19:52:06 UTC" -> date part, time part, timezone
        date, time, _ = new_data.pickup_datetime[i].split(" ")
        year, month, day = (int(part) for part in date.split("-"))
        hours, minutes, seconds = (int(part) for part in time.split(":"))
        # Time of day as seconds since midnight
        time = hours * 3600 + minutes * 60 + seconds
        # Straight-line distance in degrees (not kilometres)
        x = abs(new_data.pickup_longitude[i] - new_data.dropoff_longitude[i])
        y = abs(new_data.pickup_latitude[i] - new_data.dropoff_latitude[i])
        distance = (x**2 + y**2)**0.5
        year_list.append(year)
        month_list.append(month)
        day_list.append(day)
        time_list.append(time)
        distance_list.append(distance)
    new_data["year"], new_data["month"], new_data["day"], new_data["time"], new_data["distance"] = \
        year_list, month_list, day_list, time_list, distance_list
    new_data.drop(["pickup_datetime"], axis=1, inplace=True)
    return new_data
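As an aside, the same columns can be derived without an explicit Python loop using pandas' vectorized datetime accessors, which is typically much faster on 200,000 rows. A sketch (the function name is my own; note that `distance` here, as in the loop version, is a straight-line distance measured in degrees, not kilometres — a haversine computation would be needed for real-world distances):

```python
import numpy as np
import pandas as pd

# Sketch: a vectorized equivalent of pre_process.
def pre_process_vectorized(data: pd.DataFrame) -> pd.DataFrame:
    new_data = data.copy()
    dt = pd.to_datetime(new_data["pickup_datetime"])
    new_data["year"] = dt.dt.year
    new_data["month"] = dt.dt.month
    new_data["day"] = dt.dt.day
    # Seconds since midnight, matching the integer "time" column above
    new_data["time"] = dt.dt.hour * 3600 + dt.dt.minute * 60 + dt.dt.second
    # Euclidean distance in degrees, as in the loop version
    dx = (new_data["pickup_longitude"] - new_data["dropoff_longitude"]).abs()
    dy = (new_data["pickup_latitude"] - new_data["dropoff_latitude"]).abs()
    new_data["distance"] = np.sqrt(dx**2 + dy**2)
    return new_data.drop(columns=["pickup_datetime"])
```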
%%time
processed_data = pre_process(data)
CPU times: total: 17.8 s
Wall time: 17.8 s
processed_data.head()
|   | id | fare_amount | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count | year | month | day | time | distance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24238194 | 7.5 | -73.999817 | 40.738354 | -73.999512 | 40.723217 | 1 | 2015 | 5 | 7 | 71526 | 0.015140 |
| 1 | 27835199 | 7.7 | -73.994355 | 40.728225 | -73.994710 | 40.750325 | 1 | 2009 | 7 | 17 | 72296 | 0.022103 |
| 2 | 44984355 | 12.9 | -74.005043 | 40.740770 | -73.962565 | 40.772647 | 1 | 2009 | 8 | 24 | 78300 | 0.053109 |
| 3 | 25894730 | 5.3 | -73.976124 | 40.790844 | -73.965316 | 40.803349 | 3 | 2009 | 6 | 26 | 30141 | 0.016528 |
| 4 | 17610152 | 16.0 | -73.925023 | 40.744085 | -73.973082 | 40.761247 | 5 | 2014 | 8 | 28 | 64020 | 0.051031 |
processed_data.describe()
|   | id | fare_amount | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count | year | month | day | time | distance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.992490e+05 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 | 199249.000000 |
| mean | 2.771574e+07 | 11.359585 | -72.505237 | 39.919757 | -72.515467 | 39.924069 | 1.689449 | 2011.743662 | 6.283424 | 15.705700 | 50362.665384 | 0.202409 |
| std | 1.601412e+07 | 9.750854 | 10.438634 | 6.125905 | 10.399418 | 6.112450 | 1.305401 | 1.859080 | 3.438472 | 8.686948 | 23481.550273 | 3.721349 |
| min | 1.000000e+00 | 0.010000 | -93.824668 | -74.015515 | -75.458979 | -74.015750 | 1.000000 | 2009.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 1.382629e+07 | 6.000000 | -73.992064 | 40.734797 | -73.991409 | 40.733829 | 1.000000 | 2010.000000 | 3.000000 | 8.000000 | 33856.000000 | 0.012435 |
| 50% | 2.775601e+07 | 8.500000 | -73.981825 | 40.752583 | -73.980095 | 40.753042 | 1.000000 | 2012.000000 | 6.000000 | 16.000000 | 52680.000000 | 0.021495 |
| 75% | 4.156019e+07 | 12.500000 | -73.967165 | 40.767155 | -73.963664 | 40.767995 | 2.000000 | 2013.000000 | 9.000000 | 23.000000 | 70260.000000 | 0.038343 |
| max | 5.542357e+07 | 200.000000 | 40.808425 | 48.018760 | 40.831932 | 45.031598 | 6.000000 | 2015.000000 | 12.000000 | 31.000000 | 86399.000000 | 85.699229 |
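One artifact worth noting: the value counts earlier showed several thousand coordinates that are exactly 0.0, and those rows pass the range checks above because (0, 0) is technically a valid point on the globe. A small sketch to flag them (the helper name is my own):

```python
import pandas as pd

# Sketch: flag rows where any coordinate is exactly 0 — almost certainly
# a placeholder for a missing value rather than a real pickup/dropoff.
def zero_coordinate_rows(df: pd.DataFrame) -> pd.Series:
    coord_cols = ["pickup_longitude", "pickup_latitude",
                  "dropoff_longitude", "dropoff_latitude"]
    return (df[coord_cols] == 0).any(axis=1)
```

For example, `zero_coordinate_rows(processed_data).sum()` would count them; whether to drop them is a judgment call, since they mostly produce a distance of 0 that the model can learn to distrust.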
processed_data.loc[processed_data["fare_amount"] > 100]
|   | id | fare_amount | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count | year | month | day | time | distance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2049 | 31333682 | 113.66 | -73.951227 | 40.778753 | -73.949938 | 40.778149 | 1 | 2014 | 11 | 1 | 31359 | 0.001423 |
| 5952 | 28138818 | 105.00 | -73.752265 | 40.923303 | -73.752270 | 40.923303 | 1 | 2011 | 5 | 6 | 2400 | 0.000005 |
| 6597 | 28579349 | 137.00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1 | 2013 | 5 | 3 | 36300 | 0.000000 |
| 9036 | 33046347 | 126.10 | -73.788657 | 40.640643 | -74.001350 | 41.048048 | 1 | 2011 | 6 | 13 | 56760 | 0.459584 |
| 11266 | 52300154 | 113.00 | -74.468770 | 40.476630 | -74.468772 | 40.476630 | 2 | 2013 | 12 | 6 | 8220 | 0.000002 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 187527 | 21519159 | 165.00 | -73.219710 | 40.803600 | -73.219702 | 40.803599 | 1 | 2012 | 8 | 17 | 45368 | 0.000008 |
| 190055 | 19562599 | 120.30 | -73.788095 | 40.642330 | -73.976730 | 40.931132 | 3 | 2012 | 9 | 3 | 1260 | 0.344949 |
| 193726 | 34209729 | 130.25 | -73.982272 | 40.763447 | -74.177182 | 40.695032 | 1 | 2013 | 11 | 22 | 47220 | 0.206568 |
| 195873 | 53659256 | 109.00 | -73.984697 | 40.749896 | -74.045293 | 40.973143 | 2 | 2014 | 2 | 2 | 17025 | 0.231325 |
| 195904 | 13085828 | 200.00 | -73.952994 | 40.736298 | -73.952994 | 40.736298 | 1 | 2010 | 8 | 19 | 60765 | 0.000000 |

77 rows × 12 columns
Let us compare the different features of our data against fare_amount to get a more intuitive understanding of the data.
fig, ax = plt.subplots(3, 2, figsize=(15, 15))
ax[0, 0].hist(processed_data.passenger_count, color="lightblue")
ax[0, 0].set_xlabel("Number of Passengers")
ax[0, 0].set_ylabel("Frequency")
ax[0, 0].set_title("Passengers per Ride")
ax[0, 1].hist(processed_data.year, color="lightblue")
ax[0, 1].set_xlabel("Year")
ax[0, 1].set_ylabel("Frequency")
ax[0, 1].set_title("Rides per Year")
ax[1, 0].hist(processed_data.month, color="lightblue")
ax[1, 0].set_xlabel("Month")
ax[1, 0].set_ylabel("Frequency")
ax[1, 0].set_title("Rides per Month")
ax[1, 1].hist(processed_data.day, color="lightblue")
ax[1, 1].set_xlabel("Day of Month")
ax[1, 1].set_ylabel("Frequency")
ax[1, 1].set_title("Rides per Day")
ax[2, 0].hist(processed_data.time, color="lightblue")
ax[2, 0].set_xlabel("Time (seconds since midnight)")
ax[2, 0].set_ylabel("Frequency")
ax[2, 0].set_title("Ride Frequency by Time of Day")
ax[2, 1].hist(processed_data.fare_amount, color="lightblue")
ax[2, 1].set_xlabel("Fare Amount")
ax[2, 1].set_ylabel("Frequency")
ax[2, 1].set_title("Ride Frequency by Fare Amount")
fig, ax = plt.subplots(3, 2, figsize=(15, 15))
ax[0, 0].scatter(processed_data.passenger_count, processed_data.fare_amount, color="salmon")
ax[0, 0].set_xlabel("Number of Passengers")
ax[0, 0].set_ylabel("Fare Amount")
ax[0, 0].set_title("Fare Amount vs Number of Passengers")
ax[0, 1].scatter(processed_data.year, processed_data.fare_amount, color="salmon")
ax[0, 1].set_xlabel("Year")
ax[0, 1].set_ylabel("Fare Amount")
ax[0, 1].set_title("Fare Amount vs Year")
ax[1, 0].scatter(processed_data.month, processed_data.fare_amount, color="salmon")
ax[1, 0].set_xlabel("Month")
ax[1, 0].set_ylabel("Fare Amount")
ax[1, 0].set_title("Fare Amount vs Month")
ax[1, 1].scatter(processed_data.day, processed_data.fare_amount, color="salmon")
ax[1, 1].set_xlabel("Day of Month")
ax[1, 1].set_ylabel("Fare Amount")
ax[1, 1].set_title("Fare Amount vs Day")
ax[2, 0].scatter(processed_data.time, processed_data.fare_amount, color="salmon")
ax[2, 0].set_xlabel("Time (seconds since midnight)")
ax[2, 0].set_ylabel("Fare Amount")
ax[2, 0].set_title("Fare Amount vs Time")
ax[2, 1].scatter(processed_data.distance, processed_data.fare_amount, color="salmon")
ax[2, 1].set_xlabel("Distance (degrees)")
ax[2, 1].set_ylabel("Fare Amount")
ax[2, 1].set_title("Fare Amount vs Distance")
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(processed_data.drop(["fare_amount"], axis=1), processed_data["fare_amount"], test_size=0.2, random_state=7)
processed_data["fare_amount"]
0          7.5
1          7.7
2         12.9
3          5.3
4         16.0
          ...
199244     3.0
199245     7.5
199246    30.9
199247    14.5
199248    14.1
Name: fare_amount, Length: 199249, dtype: float64
%%time
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(train_X, train_y)
CPU times: total: 2min 52s
Wall time: 2min 55s
RandomForestRegressor()
%%time
rfr_pred = rfr.predict(test_X)
rfr_score = rfr.score(test_X, test_y)
rfr_score
CPU times: total: 2.95 s
Wall time: 2.96 s
0.8179917412241723
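Before tuning, it can be useful to sanity-check the forest against a trivial baseline on the same split; if a linear model scores nearly as well, the extra complexity is not earning its keep. A minimal sketch (the helper name is my own):

```python
from sklearn.linear_model import LinearRegression

# Sketch: a linear-regression baseline on the same train/test split —
# the random forest's R² should clearly beat this to justify its cost.
def baseline_r2(train_X, train_y, test_X, test_y) -> float:
    lin = LinearRegression()
    lin.fit(train_X, train_y)
    return lin.score(test_X, test_y)
```

For example, `baseline_r2(train_X, train_y, test_X, test_y)` returns the baseline's R² on the held-out set.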
We select the RandomForestRegressor as our model and proceed to hyperparameter tuning.
from sklearn.model_selection import GridSearchCV
parameters = {"n_estimators": [100, 250, 500, 1000]}
grid_cv = GridSearchCV(estimator=rfr, param_grid=parameters, cv=5, verbose=3)
%%time
grid_cv.fit(train_X[:10000], train_y[:10000])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ..................n_estimators=100;, score=0.745 total time=   5.4s
[CV 2/5] END ..................n_estimators=100;, score=0.824 total time=   5.5s
[CV 3/5] END ..................n_estimators=100;, score=0.735 total time=   5.4s
[CV 4/5] END ..................n_estimators=100;, score=0.810 total time=   5.5s
[CV 5/5] END ..................n_estimators=100;, score=0.773 total time=   5.3s
[CV 1/5] END ..................n_estimators=250;, score=0.748 total time=  13.7s
[CV 2/5] END ..................n_estimators=250;, score=0.822 total time=  13.9s
[CV 3/5] END ..................n_estimators=250;, score=0.738 total time=  13.7s
[CV 4/5] END ..................n_estimators=250;, score=0.816 total time=  13.9s
[CV 5/5] END ..................n_estimators=250;, score=0.781 total time=  13.4s
[CV 1/5] END ..................n_estimators=500;, score=0.750 total time=  27.5s
[CV 2/5] END ..................n_estimators=500;, score=0.821 total time=  27.9s
[CV 3/5] END ..................n_estimators=500;, score=0.737 total time=  27.5s
[CV 4/5] END ..................n_estimators=500;, score=0.817 total time=  27.9s
[CV 5/5] END ..................n_estimators=500;, score=0.780 total time=  27.0s
[CV 1/5] END .................n_estimators=1000;, score=0.750 total time=  55.1s
[CV 2/5] END .................n_estimators=1000;, score=0.823 total time=  55.8s
[CV 3/5] END .................n_estimators=1000;, score=0.737 total time=  55.2s
[CV 4/5] END .................n_estimators=1000;, score=0.817 total time=  56.0s
[CV 5/5] END .................n_estimators=1000;, score=0.779 total time=  54.0s
CPU times: total: 9min 43s
Wall time: 9min 43s
GridSearchCV(cv=5, estimator=RandomForestRegressor(), param_grid={'n_estimators': [100, 250, 500, 1000]}, verbose=3)
grid_cv.best_score_
0.7812236151900849
%%time
grid_cv2 = GridSearchCV(estimator=rfr, param_grid=parameters, cv=2, verbose=3)
grid_cv2.fit(train_X[:50000], train_y[:50000])
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END ..................n_estimators=100;, score=0.792 total time=  20.8s
[CV 2/2] END ..................n_estimators=100;, score=0.782 total time=  20.7s
[CV 1/2] END ..................n_estimators=250;, score=0.794 total time=  52.1s
[CV 2/2] END ..................n_estimators=250;, score=0.779 total time=  51.9s
[CV 1/2] END ..................n_estimators=500;, score=0.791 total time= 1.7min
[CV 2/2] END ..................n_estimators=500;, score=0.779 total time= 1.7min
[CV 1/2] END .................n_estimators=1000;, score=0.792 total time= 3.5min
[CV 2/2] END .................n_estimators=1000;, score=0.780 total time= 3.5min
CPU times: total: 13min 37s
Wall time: 13min 37s
GridSearchCV(cv=2, estimator=RandomForestRegressor(), param_grid={'n_estimators': [100, 250, 500, 1000]}, verbose=3)
grid_cv2.best_score_
0.7871638937020132
%%time
grid_cv3 = GridSearchCV(estimator=rfr, param_grid={"n_estimators":[200]}, cv=2, verbose=3)
grid_cv3.fit(train_X, train_y)
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV 1/2] END ..................n_estimators=200;, score=0.804 total time= 2.6min
[CV 2/2] END ..................n_estimators=200;, score=0.810 total time= 2.6min
CPU times: total: 10min 52s
Wall time: 10min 52s
GridSearchCV(cv=2, estimator=RandomForestRegressor(), param_grid={'n_estimators': [200]}, verbose=3)
grid_cv3.best_score_
0.8067727579313185
grid_cv3.score(test_X, test_y)
0.8200484065493455
The final model parameters are {"n_estimators": 200}.
We fit the final model and test it once more.
final_model = RandomForestRegressor(random_state=7, n_estimators=200)
final_model.fit(train_X, train_y)
final_model.score(test_X, test_y)
0.8196440323273088
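R² is unitless, so dollar-denominated metrics make the error easier to interpret alongside it. A sketch computing MAE and RMSE for any fitted regressor (the helper name is my own):

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Sketch: mean absolute error and root mean squared error, both in the
# same units as fare_amount (dollars).
def fare_errors(model, X, y):
    pred = model.predict(X)
    mae = mean_absolute_error(y, pred)
    rmse = np.sqrt(mean_squared_error(y, pred))
    return mae, rmse
```

For example, `fare_errors(final_model, test_X, test_y)` would report how many dollars off the predictions are on average.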
predictions = final_model.predict(test_X)
submission = pd.DataFrame(test_X["id"])
submission["predicted_fare_amount"] = predictions
submission.head()
|   | id | predicted_fare_amount |
|---|---|---|
| 89094 | 47502953 | 10.4935 |
| 45791 | 39239031 | 4.8755 |
| 114037 | 51374191 | 16.3325 |
| 49701 | 26644000 | 8.4980 |
| 84190 | 41093728 | 6.1175 |
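Finally, predictions like these can be exported for sharing outside the notebook. A minimal sketch (the filename is my own, and the two rows are just the first two shown above):

```python
import pandas as pd

# Sketch: round-trip the submission through CSV so the predictions can
# be shared outside the notebook.
submission = pd.DataFrame({"id": [47502953, 39239031],
                           "predicted_fare_amount": [10.4935, 4.8755]})
submission.to_csv("uber_fare_predictions.csv", index=False)
reloaded = pd.read_csv("uber_fare_predictions.csv")
print(reloaded.shape)
```

`index=False` drops the row index, so the file contains only the id and the predicted fare.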