Hello, I'm JayaPrakash, and this is part of the milestone project for my online Machine Learning course. The dataset used is a modified version of the Cleveland Heart Disease Dataset, widely used for learning the various classifiers in machine learning: https://www.kaggle.com/datasets/cherngs/heart-disease-cleveland-uci.
The methodology followed is explained below:

  1. Load and explore the dataset.
  2. Visualise the attributes and their relationship to the target.
  3. Split the data into training and test sets.
  4. Fit baseline classifiers and compare their accuracy.
  5. Tune the most promising models with grid search.
  6. Evaluate the tuned model with multiple metrics and save it for later use.

The tools used in this project are Python (in a Jupyter notebook) together with the pandas, NumPy, Matplotlib, Seaborn, scikit-learn, and joblib libraries.
The dataset contains a total of 297 entries with 14 attributes. The attributes and their descriptions are as follows:

  * age: age in years
  * sex: sex (1 = male, 0 = female)
  * cp: chest pain type (0 = typical angina, 1 = atypical angina, 2 = non-anginal pain, 3 = asymptomatic)
  * trestbps: resting blood pressure (mm Hg)
  * chol: serum cholesterol (mg/dl)
  * fbs: fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
  * restecg: resting electrocardiographic results (0 = normal, 1 = ST-T wave abnormality, 2 = left ventricular hypertrophy)
  * thalach: maximum heart rate achieved
  * exang: exercise-induced angina (1 = yes, 0 = no)
  * oldpeak: ST depression induced by exercise relative to rest
  * slope: slope of the peak exercise ST segment (0 = upsloping, 1 = flat, 2 = downsloping)
  * ca: number of major vessels (0-3) colored by fluoroscopy
  * thal: thalassemia (0 = normal, 1 = fixed defect, 2 = reversible defect)
  * condition: the target (0 = no disease, 1 = disease)
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
initial_data = pd.read_csv("heart_disease.csv")
initial_data.head(10)
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | condition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 69 | 1 | 0 | 160 | 234 | 1 | 2 | 131 | 0 | 0.1 | 1 | 1 | 0 | 0 |
| 1 | 69 | 0 | 0 | 140 | 239 | 0 | 0 | 151 | 0 | 1.8 | 0 | 2 | 0 | 0 |
| 2 | 66 | 0 | 0 | 150 | 226 | 0 | 0 | 114 | 0 | 2.6 | 2 | 0 | 0 | 0 |
| 3 | 65 | 1 | 0 | 138 | 282 | 1 | 2 | 174 | 0 | 1.4 | 1 | 1 | 0 | 1 |
| 4 | 64 | 1 | 0 | 110 | 211 | 0 | 2 | 144 | 1 | 1.8 | 1 | 0 | 0 | 0 |
| 5 | 64 | 1 | 0 | 170 | 227 | 0 | 2 | 155 | 0 | 0.6 | 1 | 0 | 2 | 0 |
| 6 | 63 | 1 | 0 | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 2 | 0 | 1 | 0 |
| 7 | 61 | 1 | 0 | 134 | 234 | 0 | 0 | 145 | 0 | 2.6 | 1 | 2 | 0 | 1 |
| 8 | 60 | 0 | 0 | 150 | 240 | 0 | 0 | 171 | 0 | 0.9 | 0 | 0 | 0 | 0 |
| 9 | 59 | 1 | 0 | 178 | 270 | 0 | 2 | 145 | 0 | 4.2 | 2 | 0 | 2 | 0 |
initial_data.describe()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | condition |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 | 297.000000 |
| mean | 54.542088 | 0.676768 | 2.158249 | 131.693603 | 247.350168 | 0.144781 | 0.996633 | 149.599327 | 0.326599 | 1.055556 | 0.602694 | 0.676768 | 0.835017 | 0.461279 |
| std | 9.049736 | 0.468500 | 0.964859 | 17.762806 | 51.997583 | 0.352474 | 0.994914 | 22.941562 | 0.469761 | 1.166123 | 0.618187 | 0.938965 | 0.956690 | 0.499340 |
| min | 29.000000 | 0.000000 | 0.000000 | 94.000000 | 126.000000 | 0.000000 | 0.000000 | 71.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 48.000000 | 0.000000 | 2.000000 | 120.000000 | 211.000000 | 0.000000 | 0.000000 | 133.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 56.000000 | 1.000000 | 2.000000 | 130.000000 | 243.000000 | 0.000000 | 1.000000 | 153.000000 | 0.000000 | 0.800000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 61.000000 | 1.000000 | 3.000000 | 140.000000 | 276.000000 | 0.000000 | 2.000000 | 166.000000 | 1.000000 | 1.600000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 |
| max | 77.000000 | 1.000000 | 3.000000 | 200.000000 | 564.000000 | 1.000000 | 2.000000 | 202.000000 | 1.000000 | 6.200000 | 2.000000 | 3.000000 | 2.000000 | 1.000000 |
initial_data.isna().sum()
age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
condition    0
dtype: int64
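There are no missing values, so no cleaning is required. Had some entries been missing, we could have imputed them; below is a minimal sketch using scikit-learn's SimpleImputer (not needed for this dataset, shown only for reference):

from sklearn.impute import SimpleImputer

# Hypothetical sketch only: fill missing numeric entries with the column median.
imputer = SimpleImputer(strategy="median")
imputed_data = pd.DataFrame(imputer.fit_transform(initial_data),
                            columns=initial_data.columns)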
We can visualise the various attributes of the data with the Matplotlib and Seaborn libraries.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(initial_data["age"], color="lightblue")
ax.set_xlabel("Age")
ax.set_ylabel("Number of Patients")
ax.set_title("Age Distribution")
plt.show()
initial_data["condition"].value_counts().plot(kind="bar",
color=["lightgreen","salmon"],
xlabel="Condition",
ylabel="Number of Patients")
plt.xticks([0, 1], ["No Disease", "Disease"], rotation=0)
plt.title("Condition of the Patients");
pd.crosstab(initial_data.sex, initial_data.condition).plot(kind="bar", color=["lightgreen", "salmon"], figsize=(10,6))
plt.xticks([0, 1], ["Female", "Male"], rotation=0)
plt.legend(["No Disease", "Disease"])
plt.title("Sex distribution categorized for Heart Disease");
plt.figure(figsize=(10, 6))
plt.scatter(initial_data.age[initial_data.condition==0],
            initial_data.chol[initial_data.condition==0],
            color="lightgreen");
plt.scatter(initial_data.age[initial_data.condition==1],
            initial_data.chol[initial_data.condition==1],
            color="salmon");
plt.title("Age vs Cholesterol impact on Heart Disease")
plt.xlabel("Age")
plt.ylabel("Cholesterol")
plt.legend(["No Disease", "Disease"]);
plt.figure(figsize=(10, 6))
plt.scatter(initial_data.age[initial_data.condition==0],
            initial_data.thalach[initial_data.condition==0],
            color="lightgreen");
plt.scatter(initial_data.age[initial_data.condition==1],
            initial_data.thalach[initial_data.condition==1],
            color="salmon");
plt.title("Age vs Maximum Heart Rate (thalach) impact on Heart Disease")
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate (thalach)")
plt.legend(["No Disease", "Disease"]);
pd.crosstab(initial_data.cp, initial_data.condition).plot(kind="bar",
                                                          color=["lightgreen", "salmon"],
                                                          figsize=(10, 6),
                                                          xlabel="Levels of Chest Pain")
plt.xticks([0, 1, 2, 3], ["Typical angina", "Atypical angina", "Non-anginal pain", "Asymptomatic"], rotation=0)
plt.legend(["No Disease", "Disease"])
plt.title("Heart Disease Factored by Chest Pain");
# Correlation heat map
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.heatmap(initial_data.corr(),
                 annot=True,
                 linewidths=0.5,
                 fmt=".2f",
                 cmap="YlGnBu")
Since the number of candidate features is small, we keep all of them. With a larger feature set we would typically reduce it to speed up model training and testing.
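For reference, one common reduction technique is univariate feature selection; below is a minimal sketch using scikit-learn's SelectKBest (the choice of k=8 is arbitrary, and this step is not applied in this project):

from sklearn.feature_selection import SelectKBest, f_classif

# Hypothetical sketch only: keep the 8 features with the strongest
# ANOVA F-score relationship to the target.
features = initial_data.drop("condition", axis=1)
selector = SelectKBest(score_func=f_classif, k=8)
reduced_features = selector.fit_transform(features, initial_data["condition"])
selected_columns = features.columns[selector.get_support()]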
from sklearn.model_selection import train_test_split
np.random.seed(7)
X = initial_data.drop("condition", axis=1)
y = initial_data["condition"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Based on the scikit-learn criteria for selecting a model (https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html), we opt for the following models:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
trial_models = {"Logistic Regression": LogisticRegression(),
"KNN": KNeighborsClassifier(),
"Random Forest": RandomForestClassifier()}
trial_models_score = {}
def clf(model, X_train, X_test, y_train, y_test):
np.random.seed(7)
for name, model in trial_models.items():
model.fit(X_train, y_train)
trial_models_score[name] = model.score(X_test, y_test)
return trial_models_score
trial_scores = clf(model=trial_models,
X_train=X_train,
X_test=X_test,
y_train=y_train,
y_test=y_test)
trial_scores
E:\Jaya Prakash\Data Science\milestone_project1\env\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
{'Logistic Regression': 0.8833333333333333, 'KNN': 0.7333333333333333, 'Random Forest': 0.85}
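The ConvergenceWarning above means the lbfgs solver ran out of iterations on the unscaled features. As the warning itself suggests, scaling the inputs is one remedy; here is a minimal sketch using a StandardScaler pipeline (an alternative we do not pursue, since we instead tune max_iter and the solver below):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardising the features helps lbfgs converge within its iteration limit.
scaled_lr = Pipeline([("scaler", StandardScaler()),
                      ("logreg", LogisticRegression())])
scaled_lr.fit(X_train, y_train)
scaled_lr.score(X_test, y_test)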
scores = pd.DataFrame(trial_scores, index=["Accuracy"])
scores.T.plot(kind="bar", color="lightgreen")
plt.xticks(rotation=0);
from sklearn.model_selection import GridSearchCV
lr_param = {"solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
"max_iter": [100, 150, 200, 250, 300]}
rf_param = {"n_estimators": [100, 150, 200],
"max_depth": [2, 4, 6],
"min_samples_leaf": [1, 2, 3, 4]}
np.random.seed(7)
clf1 = GridSearchCV(estimator=LogisticRegression(),
                    param_grid=lr_param,
                    verbose=2)
clf1.fit(X_train, y_train)
clf1.score(X_test, y_test)
0.8833333333333333
np.random.seed(7)
clf2 = GridSearchCV(estimator=RandomForestClassifier(),
                    param_grid=rf_param,
                    verbose=3,
                    cv=5)
clf2.fit(X_train, y_train)
clf2.score(X_test, y_test)
0.9166666666666666
Now that we have a prototype model, we need to evaluate its predictive capacity. For that we use the following metrics: the ROC curve and AUC, the confusion matrix, the classification report (precision, recall, F1-score), and cross-validated scores.
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import cross_val_score
predictions = clf2.predict(X_test)
predictions
array([1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)
np.array(y_test)
array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int64)
# ROC curve and AUC value
RocCurveDisplay.from_estimator(clf2, X_test, y_test)
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1fb7fb28a60>
# Confusion matrix
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(3, 3))
ax = sns.heatmap(confusion_matrix(y_test, predictions),
                 annot=True,
                 cbar=False)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.94      0.91      0.92        33
           1       0.89      0.93      0.91        27

    accuracy                           0.92        60
   macro avg       0.92      0.92      0.92        60
weighted avg       0.92      0.92      0.92        60
clf2.best_params_
{'max_depth': 2, 'min_samples_leaf': 1, 'n_estimators': 200}
clf_final = RandomForestClassifier(max_depth=2,
                                   min_samples_leaf=1,
                                   n_estimators=200)
np.random.seed(7)
cv_accuracy = cross_val_score(clf_final,
                              X,
                              y,
                              cv=10,
                              scoring="accuracy")
cv_accuracy.mean()
0.8211494252873563
def report(clf, X, y, cv):
    # Cross-validate accuracy, precision, recall, and F1 for the classifier.
    np.random.seed(7)
    parameters = ["accuracy", "precision", "recall", "f1"]
    score_report = {}
    for val in parameters:
        score = cross_val_score(clf, X, y, cv=cv, scoring=val)
        score_report[val] = np.mean(score)
    return score_report

score = report(clf_final, X, y, cv=10)
rep = pd.DataFrame(score, index=[0])
rep.T.plot.bar(legend=False);
clf2.best_score_
0.83572695035461
clf2.best_estimator_
RandomForestClassifier(max_depth=2, n_estimators=200)
import joblib
joblib.dump(clf2.best_estimator_, "heart_disease_RFC.pkl", compress=1)
['heart_disease_RFC.pkl']
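The saved model can be reloaded later and used directly for predictions; a quick usage sketch:

# Reload the persisted model and confirm it scores the same on the test set.
loaded_model = joblib.load("heart_disease_RFC.pkl")
loaded_model.score(X_test, y_test)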