Hello, I'm JayaPrakash, and this is part of the milestone project for my online Machine Learning course. The dataset used is a modified version of the Cleveland Heart Disease Dataset, widely used for learning the various classifiers in machine learning: https://www.kaggle.com/datasets/cherngs/heart-disease-cleveland-uci.
The methodology followed is explained below:

  1. Load and explore the dataset.
  2. Visualise the attributes and their relationship to the target.
  3. Split the data into training and test sets.
  4. Fit baseline classifiers and compare their accuracy.
  5. Tune the most promising models with grid search.
  6. Evaluate the tuned model with multiple metrics and save it for later use.

The tools used in this project are Python (in a Jupyter notebook) together with the pandas, NumPy, Matplotlib, Seaborn, scikit-learn, and joblib libraries.
The dataset contains a total of 297 entries with 14 attributes. The attributes and their descriptions are as follows:

  * age: age in years
  * sex: sex (1 = male, 0 = female)
  * cp: chest pain type (0 = typical angina, 1 = atypical angina, 2 = non-anginal pain, 3 = asymptomatic)
  * trestbps: resting blood pressure (mm Hg)
  * chol: serum cholesterol (mg/dl)
  * fbs: fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
  * restecg: resting electrocardiographic results (0 = normal, 1 = ST-T wave abnormality, 2 = left ventricular hypertrophy)
  * thalach: maximum heart rate achieved
  * exang: exercise-induced angina (1 = yes, 0 = no)
  * oldpeak: ST depression induced by exercise relative to rest
  * slope: slope of the peak exercise ST segment (0 = upsloping, 1 = flat, 2 = downsloping)
  * ca: number of major vessels (0-3) colored by fluoroscopy
  * thal: thalassemia (0 = normal, 1 = fixed defect, 2 = reversible defect)
  * condition: the target (0 = no disease, 1 = disease)
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
initial_data = pd.read_csv("heart_disease.csv")
initial_data.head(10)
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | condition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69 | 1 | 0 | 160 | 234 | 1 | 2 | 131 | 0 | 0.1 | 1 | 1 | 0 | 0 |
| 1 | 69 | 0 | 0 | 140 | 239 | 0 | 0 | 151 | 0 | 1.8 | 0 | 2 | 0 | 0 |
| 2 | 66 | 0 | 0 | 150 | 226 | 0 | 0 | 114 | 0 | 2.6 | 2 | 0 | 0 | 0 |
| 3 | 65 | 1 | 0 | 138 | 282 | 1 | 2 | 174 | 0 | 1.4 | 1 | 1 | 0 | 1 |
| 4 | 64 | 1 | 0 | 110 | 211 | 0 | 2 | 144 | 1 | 1.8 | 1 | 0 | 0 | 0 |
| 5 | 64 | 1 | 0 | 170 | 227 | 0 | 2 | 155 | 0 | 0.6 | 1 | 0 | 2 | 0 |
| 6 | 63 | 1 | 0 | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 2 | 0 | 1 | 0 |
| 7 | 61 | 1 | 0 | 134 | 234 | 0 | 0 | 145 | 0 | 2.6 | 1 | 2 | 0 | 1 |
| 8 | 60 | 0 | 0 | 150 | 240 | 0 | 0 | 171 | 0 | 0.9 | 0 | 0 | 0 | 0 |
| 9 | 59 | 1 | 0 | 178 | 270 | 0 | 2 | 145 | 0 | 4.2 | 2 | 0 | 2 | 0 |
initial_data.describe()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | condition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 |
| mean | 54.542088 | 0.676768 | 2.158249 | 131.693603 | 247.350168 | 0.144781 | 0.996633 | 149.599327 | 0.326599 | 1.055556 | 0.602694 | 0.676768 | 0.835017 | 0.461279 |
| std | 9.049736 | 0.468500 | 0.964859 | 17.762806 | 51.997583 | 0.352474 | 0.994914 | 22.941562 | 0.469761 | 1.166123 | 0.618187 | 0.938965 | 0.956690 | 0.499340 |
| min | 29.000000 | 0.000000 | 0.000000 | 94.000000 | 126.000000 | 0.000000 | 0.000000 | 71.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 48.000000 | 0.000000 | 2.000000 | 120.000000 | 211.000000 | 0.000000 | 0.000000 | 133.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 56.000000 | 1.000000 | 2.000000 | 130.000000 | 243.000000 | 0.000000 | 1.000000 | 153.000000 | 0.000000 | 0.800000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 61.000000 | 1.000000 | 3.000000 | 140.000000 | 276.000000 | 0.000000 | 2.000000 | 166.000000 | 1.000000 | 1.600000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 |
| max | 77.000000 | 1.000000 | 3.000000 | 200.000000 | 564.000000 | 1.000000 | 2.000000 | 202.000000 | 1.000000 | 6.200000 | 2.000000 | 3.000000 | 2.000000 | 1.000000 |
initial_data.isna().sum()
age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64
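There are no missing values, so no cleaning is required. Had some entries been missing, we could have imputed them; below is a minimal sketch using scikit-learn's SimpleImputer (not needed for this dataset, shown only for reference):

from sklearn.impute import SimpleImputer

# Hypothetical sketch only: fill missing numeric entries with the column median.
imputer = SimpleImputer(strategy="median")
imputed_data = pd.DataFrame(imputer.fit_transform(initial_data),
                            columns=initial_data.columns)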
We can visualise the various attributes of the data with the Matplotlib and Seaborn libraries.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(initial_data["age"], color="lightblue")
ax.set_xlabel("Age")
ax.set_ylabel("Number of Patients")
ax.set_title("Age Distribution")
plt.show()
initial_data["condition"].value_counts().plot(kind="bar",
color=["lightgreen","salmon"],
xlabel="Condition",
ylabel="Number of Patients")
plt.xticks([0, 1], ["No Disease", "Disease"], rotation=0)
plt.title("Condition of the Patients");
pd.crosstab(initial_data.sex, initial_data.condition).plot(kind="bar", color=["lightgreen", "salmon"], figsize=(10,6))
plt.xticks([0, 1], ["Female", "Male"], rotation=0)
plt.legend(["No Disease", "Disease"])
plt.title("Sex distribution categorized for Heart Disease");
plt.figure(figsize=(10, 6))
plt.scatter(initial_data.age[initial_data.condition==0],
            initial_data.chol[initial_data.condition==0],
            color="lightgreen");
plt.scatter(initial_data.age[initial_data.condition==1],
            initial_data.chol[initial_data.condition==1],
            color="salmon");
plt.title("Age vs Cholesterol impact on Heart Disease")
plt.xlabel("Age")
plt.ylabel("Cholesterol")
plt.legend(["No Disease", "Disease"]);
plt.figure(figsize=(10, 6))
plt.scatter(initial_data.age[initial_data.condition==0],
            initial_data.thalach[initial_data.condition==0],
            color="lightgreen");
plt.scatter(initial_data.age[initial_data.condition==1],
            initial_data.thalach[initial_data.condition==1],
            color="salmon");
plt.title("Age vs Maximum Heart Rate (thalach) impact on Heart Disease")
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate (thalach)")
plt.legend(["No Disease", "Disease"]);
pd.crosstab(initial_data.cp, initial_data.condition).plot(kind="bar",
                                                          color=["lightgreen", "salmon"],
                                                          figsize=(10, 6),
                                                          xlabel="Levels of Chest Pain")
plt.xticks([0, 1, 2, 3], ["Typical angina", "Atypical angina", "Non-anginal pain", "Asymptomatic"], rotation=0)
plt.legend(["No Disease", "Disease"])
plt.title("Heart Disease Factored by Chest Pain");
# Correlation heat map
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(initial_data.corr(),
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu")
Since the number of candidate features is small, we keep all of them. With a larger feature set we would typically reduce it to speed up model training and testing.
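For reference, one common reduction technique is univariate feature selection; below is a minimal sketch using scikit-learn's SelectKBest (the choice of k=8 is arbitrary, and this step is not applied in this project):

from sklearn.feature_selection import SelectKBest, f_classif

# Hypothetical sketch only: keep the 8 features with the strongest
# ANOVA F-score relationship to the target.
features = initial_data.drop("condition", axis=1)
selector = SelectKBest(score_func=f_classif, k=8)
reduced_features = selector.fit_transform(features, initial_data["condition"])
selected_columns = features.columns[selector.get_support()]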
from sklearn.model_selection import train_test_split
np.random.seed(7)
X = initial_data.drop("condition", axis=1)
y = initial_data["condition"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Based on the scikit-learn criteria for selecting a model (https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html), we opt for the following models:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
trial_models = {"Logistic Regression": LogisticRegression(),
"KNN": KNeighborsClassifier(),
"Random Forest": RandomForestClassifier()}
trial_models_score = {}
def clf(model, X_train, X_test, y_train, y_test):
np.random.seed(7)
for name, model in trial_models.items():
model.fit(X_train, y_train)
trial_models_score[name] = model.score(X_test, y_test)
return trial_models_score
trial_scores = clf(model=trial_models,
X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test)
trial_scores
E:\Jaya Prakash\Data Science\milestone_project1\env\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
{'Logistic Regression': 0.8833333333333333, 'KNN': 0.7333333333333333, 'Random Forest': 0.85}
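The ConvergenceWarning above means the lbfgs solver ran out of iterations on the unscaled features. As the warning itself suggests, scaling the inputs is one remedy; here is a minimal sketch using a StandardScaler pipeline (an alternative we do not pursue, since we instead tune max_iter and the solver below):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardising the features helps lbfgs converge within its iteration limit.
scaled_lr = Pipeline([("scaler", StandardScaler()),
                      ("logreg", LogisticRegression())])
scaled_lr.fit(X_train, y_train)
scaled_lr.score(X_test, y_test)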
scores = pd.DataFrame(trial_scores, index=["Accuracy"])
scores.T.plot(kind="bar", color="lightgreen")
plt.xticks(rotation=0);
from sklearn.model_selection import GridSearchCV
lr_param = {"solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
"max_iter": [100, 150, 200, 250, 300]}
rf_param = {"n_estimators": [100, 150, 200],
"max_depth": [2, 4, 6],
"min_samples_leaf": [1, 2, 3, 4]}
np.random.seed(7)
clf1 = GridSearchCV(estimator=LogisticRegression(),
                    param_grid=lr_param,
                    verbose=2)
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)
0.8833333333333333
np.random.seed(7)
clf2 = GridSearchCV(estimator=RandomForestClassifier(),
                    param_grid=rf_param,
                    verbose=3,
                    cv=5)
clf2.fit(X_train, y_train)
clf2.score(X_test, y_test)
0.9166666666666666
Now that we have a prototype model, we need to evaluate its predictive capacity. For that we use the following metrics: the ROC curve and AUC, the confusion matrix, the classification report (precision, recall, F1-score), and cross-validated scores.
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import cross_val_score
predictions = clf2.predict(X_test)
predictions
array([1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)
np.array(y_test)
array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)
# ROC curve and AUC value
RocCurveDisplay.from_estimator(clf2, X_test, y_test)
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1fb7fb28a60>
# Confusion matrix
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(3, 3))
ax = sns.heatmap(confusion_matrix(y_test, predictions),
                 annot=True,
                 cbar=False)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.94      0.91      0.92        33
           1       0.89      0.93      0.91        27

    accuracy                           0.92        60
   macro avg       0.92      0.92      0.92        60
weighted avg       0.92      0.92      0.92        60
clf2.best_params_
{'max_depth': 2, 'min_samples_leaf': 1, 'n_estimators': 200}
clf_final = RandomForestClassifier(max_depth=2,
                                   min_samples_leaf=1,
                                   n_estimators=200)
np.random.seed(7)
cv_accuracy = cross_val_score(clf_final,
                              X,
                              y,
                              cv=10,
                              scoring="accuracy")
cv_accuracy.mean()
0.8211494252873563
def report(clf, X, y, cv):
    # Cross-validate accuracy, precision, recall, and F1 for the classifier.
    np.random.seed(7)
    parameters = ["accuracy", "precision", "recall", "f1"]
    score_report = {}
    for val in parameters:
        score = cross_val_score(clf, X, y, cv=cv, scoring=val)
        score_report[val] = np.mean(score)
    return score_report

score = report(clf_final, X, y, cv=10)
rep = pd.DataFrame(score, index=[0])
rep.T.plot.bar(legend=False);
clf2.best_score_
0.83572695035461
clf2.best_estimator_
RandomForestClassifier(max_depth=2, n_estimators=200)
import joblib
joblib.dump(clf2.best_estimator_, "heart_disease_RFC.pkl", compress=1)
['heart_disease_RFC.pkl']
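The saved model can be reloaded later and used directly for predictions; a quick usage sketch:

# Reload the persisted model and confirm it scores the same on the test set.
loaded_model = joblib.load("heart_disease_RFC.pkl")
loaded_model.score(X_test, y_test)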