Hello! I'm JayaPrakash, and this is part of a milestone project on my journey of learning machine learning. It is a regression problem where we need to predict the sale price of a bulldozer with good accuracy. The data used is obtained from Kaggle (https://www.kaggle.com/c/bluebook-for-bulldozers).
Import the required libraries used in this notebook, and define all the functions used here, to keep the notebook organized.
# Import the essentials.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import the required models and metrics.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
# A function that returns a DataFrame of boolean missing-value indicators.
def is_missing(data):
    missing_data = pd.DataFrame()
    for label, content in data.items():
        missing_data[f"{label}_is_missing"] = content.isna()
    return missing_data
# A function to process the data into the format required by the ML models.
def preprocessing(original_data):
    '''
    Takes a DataFrame, converts object dtypes to category codes, fills missing
    numeric values with the column median and replaces the saledate column with
    saleyear, salemonth and saleday columns.
    Returns a new DataFrame.
    '''
    data = original_data.copy()
    # Split the sale date into its components first, so the column layout stays
    # stable while iterating over the remaining columns.
    data["saledate"] = pd.to_datetime(data["saledate"])
    data["saleday"] = data["saledate"].dt.day
    data["salemonth"] = data["saledate"].dt.month
    data["saleyear"] = data["saledate"].dt.year
    data.drop(["saledate"], axis=1, inplace=True)
    for label, content in data.items():
        if content.dtype in ("int64", "float64"):
            # Fill missing numeric values with the column median.
            data[label] = content.fillna(content.median())
        elif content.dtype == "object":
            # Encode object columns as ordered categories; +1 shifts the code for
            # missing values from -1 to 0.
            data[label] = content.astype("category").cat.as_ordered().cat.codes + 1
    return data
# Reduce the dimensionality of the dataset.
def correlate(data, value):
    '''
    Takes a DataFrame and keeps only the features whose absolute correlation with
    the target (SalePrice) is greater than the given value.
    Returns the filtered DataFrame and the list of selected feature names.
    '''
    corr = {}
    correlated_features = []
    for label in data.columns:
        corr[label] = data[label].corr(data["SalePrice"])
        if abs(corr[label]) > value:
            correlated_features.append(label)
    corr_data = data[correlated_features].copy()
    return corr_data, correlated_features
# Define a new evaluation metric as required by the competition.
def rmsle(model, X, y):
    y_preds = model.predict(X)
    return {"rmsle": np.sqrt(mean_squared_log_error(y, y_preds))}
# A function to test a given model and return the required score and parameters.
def check(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    train_rmsle = rmsle(model, X_train, y_train)
    valid_rmsle = rmsle(model, X_valid, y_valid)
    model_parameter = model.get_params()
    result = {"train_rmsle": train_rmsle,
              "train_score": model.score(X_train, y_train),
              "valid_rmsle": valid_rmsle,
              "valid_score": model.score(X_valid, y_valid),
              "n_estimators": model_parameter["n_estimators"],
              "min_samples_leaf": model_parameter["min_samples_leaf"],
              "min_samples_split": model_parameter["min_samples_split"],
              "max_depth": model_parameter["max_depth"],
              "max_features": model_parameter["max_features"]}
    return result
# A custom scoring function for GridSearchCV / RandomizedSearchCV (to be wrapped with make_scorer).
def rmsle_score(y_true, y_preds):
    return np.sqrt(mean_squared_log_error(y_true, y_preds))
Import the data and clean it into a format suitable for fitting the models. Also look for ways to reduce the number of features iteratively, so as to make experimentation with model tuning easier.
# Read the unprocessed training and validation data.
train_data = pd.read_csv("Train.csv", low_memory=False)
# Train.csv contains the first 401125 rows of TrainAndValid.csv; the rows after
# that form the validation set.
valid = pd.read_csv("TrainAndValid.csv", low_memory=False)
valid_data = valid[401125:].copy()
# Create a DataFrame to document missing values in the input DataFrame.
missing_train_data = is_missing(train_data)
missing_valid_data = is_missing(valid_data)
# Process the training data into a format suitable for the Random Forest Regressor Model.
processed_train_data = preprocessing(train_data)
processed_valid_data = preprocessing(valid_data)
# Dimensionality Reduction of features based on their correlation with the target variable.
corr_train_data, correlated_features = correlate(processed_train_data, 0.05)
corr_valid_data = processed_valid_data[correlated_features].copy()
# Add the missing-value indicator columns to improve the model score (since many of the filled-in values were originally missing).
is_missing_features = []
for feature in correlated_features:
is_missing_features.append(f"{feature}_is_missing")
final_train_data = corr_train_data.copy()
final_train_data[is_missing_features] = missing_train_data[is_missing_features]
final_valid_data = corr_valid_data.copy()
final_valid_data[is_missing_features] = missing_valid_data[is_missing_features]
# Define the features and target for the various iterations.
# Features with reduced dimensionality.
X_train_corr = corr_train_data.drop(["SalePrice"], axis=1)
y_train_corr = corr_train_data["SalePrice"]
X_valid_corr = corr_valid_data.drop(["SalePrice"], axis=1)
y_valid_corr = corr_valid_data["SalePrice"]
# Trial with added missing columns.
X_train_final = final_train_data.drop(["SalePrice"], axis=1)
y_train_final = final_train_data["SalePrice"]
X_valid_final = final_valid_data.drop(["SalePrice"], axis=1)
y_valid_final = final_valid_data["SalePrice"]
# Visualize the Correlation between the features using a heatmap (Not Necessary, but everyone loves heatmaps :)
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(processed_train_data.corr(),
                 annot=False,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu")
Fit the data to the model and obtain a preliminary evaluation of the model's performance.
# Initialize the model.
rf = RandomForestRegressor(n_jobs=-1,
                           verbose=0,
                           random_state=42,
                           max_samples=10000)
%%time
# Preliminary testing of the model.
check(rf, X_train_corr, y_train_corr, X_valid_corr, y_valid_corr)
CPU times: total: 33.2 s Wall time: 4.9 s
{'train_rmsle': {'rmsle': 0.3214763111437421}, 'train_score': 0.7688186790936686, 'valid_rmsle': {'rmsle': 0.3912055181913352}, 'valid_score': 0.6411732838970292, 'n_estimators': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_depth': None, 'max_features': 'auto'}
%%time
check(rf, X_train_final, y_train_final, X_valid_final, y_valid_final)
CPU times: total: 52.8 s Wall time: 12.4 s
{'train_rmsle': {'rmsle': 0.3213505004225222}, 'train_score': 0.7689366305645509, 'valid_rmsle': {'rmsle': 0.3900450726306095}, 'valid_score': 0.6436302560623703, 'n_estimators': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_depth': None, 'max_features': 'auto'}
Define a grid search (evaluated on its performance) to determine the best parameters for the model. Unable to proceed, as my system isn't capable of tuning the model efficiently. :(
rf2 = RandomForestRegressor(random_state=42,
                            n_estimators=1000)
%%time
check(rf2, X_train_corr, y_train_corr, X_valid_corr, y_valid_corr)
%%time
check(rf2, X_train_final, y_train_final, X_valid_final, y_valid_final)
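For completeness, the intended hyperparameter search could be wired up with the already imported RandomizedSearchCV and make_scorer. This is only a sketch: the parameter grid below is illustrative and is not the configuration behind the leaderboard score.
# A minimal sketch of the hyperparameter search (illustrative grid, not run here).
rmsle_scorer = make_scorer(rmsle_score, greater_is_better=False)
rf_grid = {"n_estimators": [100, 500, 1000],
           "min_samples_leaf": [1, 5, 10],
           "min_samples_split": [2, 10, 20],
           "max_features": [0.5, 1.0, "sqrt"],
           "max_depth": [None, 10]}
rs_rf = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                 random_state=42,
                                                 max_samples=10000),
                           param_distributions=rf_grid,
                           n_iter=10,
                           cv=3,
                           scoring=rmsle_scorer,
                           verbose=1)
# rs_rf.fit(X_train_final, y_train_final)
# rs_rf.best_params_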
The final result of roughly 0.39 root mean squared log error from the Random Forest Regressor corresponds to around 90th place on the Kaggle leaderboard. Bronze medal obtained!