Hello! I'm JayaPrakash, and this is part of a milestone project on my journey of learning machine learning. It is a regression problem where we need to predict the sale price of a bulldozer with good accuracy. The data used is obtained from Kaggle (https://www.kaggle.com/c/bluebook-for-bulldozers).
Import the required libraries used in this notebook, and define all the functions used here, to keep the notebook organized.
# Import the essentials.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import the required models and metrics.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
# A function that returns a DataFrame of boolean missing-value indicators.
def is_missing(data):
    missing_data = pd.DataFrame()
    for label, content in data.items():
        missing_data[f"{label}_is_missing"] = content.isna()
    return missing_data
# A function to process the data into the format required by the ML models.
def preprocessing(original_data):
    '''
    Takes a DataFrame, converts object dtypes to category codes, fills missing
    numeric values with the column median and replaces the saledate column with
    saleyear, salemonth and saleday columns.
    Returns a new DataFrame.
    '''
    data = original_data.copy()
    # Split the sale date into its components first, so the column layout stays
    # stable while iterating over the remaining columns.
    data["saledate"] = pd.to_datetime(data["saledate"])
    data["saleday"] = data["saledate"].dt.day
    data["salemonth"] = data["saledate"].dt.month
    data["saleyear"] = data["saledate"].dt.year
    data.drop(["saledate"], axis=1, inplace=True)
    for label, content in data.items():
        if content.dtype in ("int64", "float64"):
            # Fill missing numeric values with the column median.
            data[label] = content.fillna(content.median())
        elif content.dtype == "object":
            # Encode object columns as ordered categories; +1 shifts the code for
            # missing values from -1 to 0.
            data[label] = content.astype("category").cat.as_ordered().cat.codes + 1
    return data
# Reduce the dimensionality of the dataset.
def correlate(data, value):
    '''
    Takes a DataFrame and keeps only the features whose absolute correlation with
    the target (SalePrice) is greater than the given value.
    Returns the filtered DataFrame and the list of selected feature names.
    '''
    corr = {}
    correlated_features = []
    for label in data.columns:
        corr[label] = data[label].corr(data["SalePrice"])
        if abs(corr[label]) > value:
            correlated_features.append(label)
    corr_data = data[correlated_features].copy()
    return corr_data, correlated_features
# Define a new evaluation metric as required by the competition.
def rmsle(model, X, y):
    y_preds = model.predict(X)
    return {"rmsle": np.sqrt(mean_squared_log_error(y, y_preds))}
# A function to test a given model and return the required score and parameters.
def check(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    train_rmsle = rmsle(model, X_train, y_train)
    valid_rmsle = rmsle(model, X_valid, y_valid)
    model_parameter = model.get_params()
    result = {"train_rmsle": train_rmsle,
              "train_score": model.score(X_train, y_train),
              "valid_rmsle": valid_rmsle,
              "valid_score": model.score(X_valid, y_valid),
              "n_estimators": model_parameter["n_estimators"],
              "min_samples_leaf": model_parameter["min_samples_leaf"],
              "min_samples_split": model_parameter["min_samples_split"],
              "max_depth": model_parameter["max_depth"],
              "max_features": model_parameter["max_features"]}
    return result
# A custom scoring function for GridSearchCV / RandomizedSearchCV (to be wrapped with make_scorer).
def rmsle_score(y_true, y_preds):
    return np.sqrt(mean_squared_log_error(y_true, y_preds))
Import the data and clean it into a format suitable for fitting the models. Also look for ways to reduce the number of features iteratively, so as to make experimentation with model tuning easier.
# Read the unprocessed training and validation data.
train_data = pd.read_csv("Train.csv", low_memory=False)
# Train.csv contains the first 401125 rows of TrainAndValid.csv; the rows after
# that form the validation set.
valid = pd.read_csv("TrainAndValid.csv", low_memory=False)
valid_data = valid[401125:].copy()
# Create a DataFrame to document missing values in the input DataFrame.
missing_train_data = is_missing(train_data)
missing_valid_data = is_missing(valid_data)
# Process the training data into a format suitable for the Random Forest Regressor Model.
processed_train_data = preprocessing(train_data)
processed_valid_data = preprocessing(valid_data)
# Dimensionality Reduction of features based on their correlation with the target variable.
corr_train_data, correlated_features = correlate(processed_train_data, 0.05)
corr_valid_data = processed_valid_data[correlated_features].copy()
# Add the missing-value indicator columns to improve the model score (since many of the filled-in values were originally missing).
is_missing_features = []
for feature in correlated_features:
is_missing_features.append(f"{feature}_is_missing")
final_train_data = corr_train_data.copy()
final_train_data[is_missing_features] = missing_train_data[is_missing_features]
final_valid_data = corr_valid_data.copy()
final_valid_data[is_missing_features] = missing_valid_data[is_missing_features]
# Define the features and target for the various iterations.
# Features with reduced dimensionality.
X_train_corr = corr_train_data.drop(["SalePrice"], axis=1)
y_train_corr = corr_train_data["SalePrice"]
X_valid_corr = corr_valid_data.drop(["SalePrice"], axis=1)
y_valid_corr = corr_valid_data["SalePrice"]
# Trial with added missing columns.
X_train_final = final_train_data.drop(["SalePrice"], axis=1)
y_train_final = final_train_data["SalePrice"]
X_valid_final = final_valid_data.drop(["SalePrice"], axis=1)
y_valid_final = final_valid_data["SalePrice"]
# Visualize the Correlation between the features using a heatmap (Not Necessary, but everyone loves heatmaps :)
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(processed_train_data.corr(),
                 annot=False,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu")
Fit the data to the model and obtain a preliminary evaluation of the model's performance.
# Initialize the model.
rf = RandomForestRegressor(n_jobs=-1,
                           verbose=0,
                           random_state=42,
                           max_samples=10000)
%%time
# Preliminary testing of the model.
check(rf, X_train_corr, y_train_corr, X_valid_corr, y_valid_corr)
CPU times: total: 33.2 s Wall time: 4.9 s
{'train_rmsle': {'rmsle': 0.3214763111437421}, 'train_score': 0.7688186790936686, 'valid_rmsle': {'rmsle': 0.3912055181913352}, 'valid_score': 0.6411732838970292, 'n_estimators': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_depth': None, 'max_features': 'auto'}
%%time
check(rf, X_train_final, y_train_final, X_valid_final, y_valid_final)
CPU times: total: 52.8 s Wall time: 12.4 s
{'train_rmsle': {'rmsle': 0.3213505004225222}, 'train_score': 0.7689366305645509, 'valid_rmsle': {'rmsle': 0.3900450726306095}, 'valid_score': 0.6436302560623703, 'n_estimators': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_depth': None, 'max_features': 'auto'}
Define a grid search (evaluated on its performance) to determine the best parameters for the model. Unable to proceed, as my system isn't capable of tuning the model efficiently. :(
rf2 = RandomForestRegressor(random_state=42,
                            n_estimators=1000)
%%time
check(rf2, X_train_corr, y_train_corr, X_valid_corr, y_valid_corr)
%%time
check(rf2, X_train_final, y_train_final, X_valid_final, y_valid_final)
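For completeness, the intended hyperparameter search could be wired up with the already imported RandomizedSearchCV and make_scorer. This is only a sketch: the parameter grid below is illustrative and is not the configuration behind the leaderboard score.
# A minimal sketch of the hyperparameter search (illustrative grid, not run here).
rmsle_scorer = make_scorer(rmsle_score, greater_is_better=False)
rf_grid = {"n_estimators": [100, 500, 1000],
           "min_samples_leaf": [1, 5, 10],
           "min_samples_split": [2, 10, 20],
           "max_features": [0.5, 1.0, "sqrt"],
           "max_depth": [None, 10]}
rs_rf = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                 random_state=42,
                                                 max_samples=10000),
                           param_distributions=rf_grid,
                           n_iter=10,
                           cv=3,
                           scoring=rmsle_scorer,
                           verbose=1)
# rs_rf.fit(X_train_final, y_train_final)
# rs_rf.best_params_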
The final result of roughly 0.39 root mean squared log error from the Random Forest Regressor corresponds to around 90th place on the Kaggle leaderboard. Bronze medal obtained!