Python Solution
Python Solution
1. Introduction
– 1.1 Data Dictionary
– 1.2 Task
2. Preparation
– 2.1 Packages
– 2.2 Data
– 2.3 Understanding Data
3. Exploratory Data Analysis
– 3.1 Univariate Analysis
– 3.2 Bivariate Analysis
4. Data Preprocessing
– 4.1 Conclusions from the EDA
– 4.2 Packages
– 4.3 Making features model ready
5. Modeling
– 5.1 Linear Classifiers
– 5.2 Tree Models
1. Introduction
back to top
slp - Slope
1.2 Task
To perform EDA and predict if a person is prone to a heart attack or not.
2. Preparation
back to top
2.1 Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
2.2 Data
# Load the heart-attack dataset from the Kaggle input directory.
# The path has to be a single unbroken string literal: a line break inside
# the quotes is a syntax error, so the call is wrapped around the literal
# instead of through it.
df = pd.read_csv(
    "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
)
# Build a frame from the mapping with a single row labelled "unique count",
# then transpose it so each key becomes a row.
# NOTE(review): `dict` shadows the Python builtin; it is presumably a
# {column name: number of unique values} mapping built in a cell that is not
# shown here — confirm, and consider renaming it.
pd.DataFrame(dict,index=["unique count"]).transpose()
unique count
age 41
sex 2
cp 4
trtbps 49
chol 152
fbs 2
restecg 3
thalachh 91
exng 2
oldpeak 40
slp 3
caa 5
thall 4
output 2
age 0
sex 0
cp 0
trtbps 0
chol 0
fbs 0
restecg 0
thalachh 0
exng 0
oldpeak 0
slp 0
caa 0
thall 0
output 0
dtype: int64
# Count plot of every categorical feature, one feature per axis.
# Each entry: (axis, dataframe column, panel title, title x, title y).
count_panels = [
    (ax1, 'sex', 'Sex', 0.3, 220),
    (ax2, 'exng', 'Exng', 0.3, 220),
    (ax3, 'caa', 'Caa', 1.5, 200),
    (ax4, 'cp', 'Cp', 1.5, 162),
    (ax5, 'fbs', 'Fbs', 0.5, 290),
    (ax6, 'restecg', 'Restecg', 0.75, 165),
    (ax7, 'slp', 'Slp', 0.85, 155),
    (ax8, 'thall', 'Thall', 1.2, 180),
]

for panel, column, title, title_x, title_y in count_panels:
    # Title is placed in data coordinates above the bars.
    panel.text(title_x, title_y, title, fontsize=14, fontweight='bold',
               fontfamily='serif', color="#000000")
    # Dotted horizontal guide lines only.
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0,
               dashes=(1,5))
    sns.countplot(ax=panel, data=df, x=column, palette=color_palette)
    panel.set_xlabel("")
    panel.set_ylabel("")

# Keep only the bottom spine on every count plot.
for side in ["top","right","left"]:
    for panel, *_ in count_panels:
        panel.spines[side].set_visible(False)
# Shared colours for the continuous-feature figure.
background_color = "#ffe6e6"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for panel in (ax0, ax1, ax2, ax3, ax4, ax5):
    panel.set_facecolor(background_color)

# One boxen plot per continuous column.
# Each entry: (axis, column, panel title, title x, title y, box colour).
boxen_panels = [
    (ax1, 'age', 'Age', -0.05, 81, "#800000"),
    (ax2, 'trtbps', 'Trtbps', -0.05, 208, "#8000ff"),
    (ax3, 'chol', 'Chol', -0.05, 600, "#6aac90"),
    (ax4, 'thalachh', 'Thalachh', -0.09, 210, "#5833ff"),
    (ax5, 'oldpeak', 'Oldpeak', -0.1, 6.6, "#da8829"),
]

for panel, column, title, title_x, title_y, box_colour in boxen_panels:
    panel.text(title_x, title_y, title, fontsize=14, fontweight='bold',
               fontfamily='serif', color="#000000")
    # Dotted horizontal guide lines only.
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0,
               dashes=(1,5))
    sns.boxenplot(ax=panel, y=df[column], palette=[box_colour], width=0.6)
    panel.set_xlabel("")
    panel.set_ylabel("")

# Keep only the bottom spine on every boxen plot.
for side in ["top","right","left"]:
    for panel, *_ in boxen_panels:
        panel.spines[side].set_visible(False)
3.1.2 Count plot of target
# Two-panel figure: ax0 is an empty, frameless panel on the left and ax1
# holds the count plot of the target column `output` on the right.
fig = plt.figure(figsize=(18,7))
gs = fig.add_gridspec(1,2)
gs.update(wspace=0.3, hspace=0.15)
ax0 = fig.add_subplot(gs[0,0])
ax1 = fig.add_subplot(gs[0,1])

background_color = "#ffe6e6"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
ax0.set_facecolor(background_color)
ax1.set_facecolor(background_color)

# Strip ticks and tick labels from the left panel.
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.tick_params(left=False, bottom=False)

# Target Count
ax1.text(0.35,177,"Output",fontsize=14, fontweight='bold',
         fontfamily='serif', color="#000000")
ax1.grid(color='#000000', linestyle=':', axis='y', zorder=0,
         dashes=(1,5))
sns.countplot(ax=ax1, data=df, x = 'output',palette = color_palette)
ax1.set_xlabel("")
ax1.set_ylabel("")
# Each label is kept as one unbroken string literal; a line break inside the
# quotes is a syntax error.
ax1.set_xticklabels([
    "Low chances of attack(0)",
    "High chances of attack(1)",
])

# ax0 loses every spine; ax1 keeps only its bottom spine.
for side in ("top", "left", "bottom", "right"):
    ax0.spines[side].set_visible(False)
for side in ("top", "left", "right"):
    ax1.spines[side].set_visible(False)
# Heatmap of the pairwise correlations between the columns in `con_cols`.
fig = plt.figure(figsize=(10,10))
gs = fig.add_gridspec(1,1)
gs.update(wspace=0.3, hspace=0.15)
ax0 = fig.add_subplot(gs[0,0])

color_palette = ["#5833ff","#da8829"]

# The correlation frame must exist before the mask is derived from it;
# building the mask first only worked if an earlier cell had already
# defined `df_corr`.
df_corr = df[con_cols].corr().transpose()

# Ones on and above the diagonal: heatmap leaves those cells blank, so only
# the lower triangle is drawn.
mask = np.triu(np.ones_like(df_corr))

ax0.text(1.5,-0.1,"Correlation Matrix",fontsize=22, fontweight='bold',
         fontfamily='serif', color="#000000")
sns.heatmap(df_corr,mask=mask,fmt=".1f",annot=True,cmap='YlGnBu')
plt.show()
3.2.2 Scatterplot heatmap of dataframe
# Scatterplot heatmap: one dot per pair of columns, coloured and sized by
# their correlation.
# Long-form table with columns level_0, level_1 and correlation.
corr_mat = df.corr().stack().reset_index(name="correlation")

# relplot is a figure-level function and builds its own figure, so no
# plt.figure() call precedes it (that only produced an extra blank figure).
g = sns.relplot(
    data=corr_mat,
    x="level_0", y="level_1", hue="correlation", size="correlation",
    palette="YlGnBu", hue_norm=(-1, 1), edgecolor=".7",
    height=10, sizes=(50, 250), size_norm=(-.2, .8),
)
# Keep the notebook-level name `fig` bound, now to the figure actually drawn.
fig = g.fig

g.set(xlabel="features on X", ylabel="features on Y", aspect="equal")
g.fig.suptitle('Scatterplot heatmap',fontsize=22, fontweight='bold',
               fontfamily='serif', color="#000000")
g.despine(left=True, bottom=True)
g.ax.margins(.02)
for label in g.ax.get_xticklabels():
    label.set_rotation(90)

# Matplotlib renamed Legend.legendHandles to Legend.legend_handles; prefer
# the new attribute and fall back to the old one on older installations.
legend_handles = getattr(g.legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g.legend.legendHandles
for artist in legend_handles:
    artist.set_edgecolor(".7")
plt.show()
# Distribution of each continuous feature split by the target variable.
# Axes come in pairs: an even-numbered text panel carrying the title and the
# odd-numbered panel next to it carrying the KDE plot.
background_color = "#ffe6e6"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
fig.patch.set_facecolor(background_color)
for panel in (ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9):
    panel.set_facecolor(background_color)

# Each entry: (title axis, plot axis, dataframe column).
kde_panels = [
    (ax0, ax1, 'age'),
    (ax2, ax3, 'trtbps'),
    (ax4, ax5, 'chol'),
    (ax6, ax7, 'thalachh'),
    (ax8, ax9, 'oldpeak'),
]

for title_ax, plot_ax, column in kde_panels:
    # The title is one unbroken literal so every "\n" escape stays intact;
    # splitting it between the backslash and the "n" silently turns the
    # escape into a line continuation followed by a stray letter.
    title_ax.text(0.5,0.5,
                  f"Distribution of {column}\naccording to\n target variable\n___________",
                  horizontalalignment = 'center',
                  verticalalignment = 'center',
                  fontsize = 18,
                  fontweight='bold',
                  fontfamily='serif',
                  color='#000000')
    # Text-only panel: no bottom spine, ticks or tick labels.
    title_ax.spines["bottom"].set_visible(False)
    title_ax.set_xticklabels([])
    title_ax.set_yticklabels([])
    title_ax.tick_params(left=False, bottom=False)

    # Filled KDE of the column, one curve per value of `output`.
    plot_ax.grid(color='#000000', linestyle=':', axis='y', zorder=0,
                 dashes=(1,5))
    sns.kdeplot(ax=plot_ax, data=df, x=column,hue="output",
                fill=True,palette=["#8000ff","#da8829"], alpha=.5, linewidth=0)
    plot_ax.set_xlabel("")
    plot_ax.set_ylabel("")

# Hide the top, left and right spines on every panel.
for i in ["top","left","right"]:
    for panel in (ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9):
        panel.spines[i].set_visible(False)
3.2.4 Some other relations that seemed intuitive
# Figure 3.2.4: six rows, each pairing a text panel (left column) with the
# plot it describes (right column).
fig = plt.figure(figsize=(18,20))
gs = fig.add_gridspec(6,2)
gs.update(wspace=0.5, hspace=0.5)
# Row-major order: ax0/ax1 form the first row, ax2/ax3 the second, and so on.
(ax0, ax1, ax2, ax3, ax4, ax5,
 ax6, ax7, ax8, ax9, ax10, ax11) = [
    fig.add_subplot(gs[row, col]) for row in range(6) for col in range(2)
]
all_panels = (ax0, ax1, ax2, ax3, ax4, ax5,
              ax6, ax7, ax8, ax9, ax10, ax11)
plot_panels = (ax1, ax3, ax5, ax7, ax9, ax11)

background_color = "#ffe6e6"
color_palette = ["#800000","#8000ff","#6aac90","#5833ff","#da8829"]
target_palette = ["#8000ff","#da8829"]  # one colour per value of `output`
fig.patch.set_facecolor(background_color)
for panel in all_panels:
    panel.set_facecolor(background_color)

# Text panels: (axis, bold heading, explanatory note drawn at x=1).
# Cp coding: 0 = Typical Angina, 1 = Atypical Angina,
# 2 = Non-anginal Pain, 3 = Asymptomatic
title_panels = [
    (ax0, "Chest pain\ndistribution\n__________",
     "0 - Typical Angina\n1 - Atypical Angina\n2 - Non-anginal Pain\n3 - Asymptomatic"),
    (ax2, "Number of\nmajor vessels\n___________",
     "0 vessels\n1 vessel\n2 vessels\n3 vessels\n4 vessels"),
    (ax4, "Heart Attack\naccording to\nsex\n______",
     "0 - Female\n1 - Male"),
    (ax6, "Distribution of thall\naccording to\n target variable\n___________",
     "Thallium Stress\nTest Result\n0, 1, 2, 3"),
    (ax8, "Boxen plot of\nthalachh wrt\noutcome\n_______",
     "Maximum heart\nrate achieved"),
    (ax10, "Strip Plot of\nexng vs age\n______",
     "Exercise induced\nangina\n0 - No\n1 - Yes"),
]

for panel, heading, note in title_panels:
    panel.text(0.5,0.5,heading,
               horizontalalignment = 'center',
               verticalalignment = 'center',
               fontsize = 18,
               fontweight='bold',
               fontfamily='serif',
               color='#000000')
    panel.text(1,.5,note,
               horizontalalignment = 'center',
               verticalalignment = 'center',
               fontsize = 14
               )
    # Text-only panel: no bottom spine, ticks or tick labels.
    panel.spines["bottom"].set_visible(False)
    panel.set_xticklabels([])
    panel.set_yticklabels([])
    panel.tick_params(left=False, bottom=False)

# Dotted horizontal guide lines on every plot panel, set before plotting.
for panel in plot_panels:
    panel.grid(color='#000000', linestyle=':', axis='y', zorder=0,
               dashes=(1,5))

# Cp, Caa and Thall: filled KDE per value of `output`.
for panel, column in ((ax1, 'cp'), (ax3, 'caa'), (ax7, 'thall')):
    sns.kdeplot(ax=panel, data=df, x=column,hue="output",
                fill=True,palette=target_palette, alpha=.5, linewidth=0)

# Sex: counts per sex, split by `output`.
sns.countplot(ax=ax5,data=df,x='sex',palette=target_palette,
              hue='output')

# Thalachh: boxen plot of thalachh for each value of `output`.
sns.boxenplot(ax=ax9,
              data=df,x='output',y='thalachh',palette=target_palette)

# Exng: age against exng, coloured by `output`.
sns.stripplot(ax=ax11,
              data=df,x='exng',y='age',hue='output',palette=target_palette)

# Clear the axis labels on every plot panel. This includes ax11: the
# original cleared ax9 a second time here and left the strip plot's labels
# in place.
for panel in plot_panels:
    panel.set_xlabel("")
    panel.set_ylabel("")

# Hide the top, left and right spines on every panel.
for i in ["top","left","right"]:
    for panel in all_panels:
        panel.spines[i].set_visible(False)
3.2.5 Pairplot according to target variable - one plot to rule them all
# Pairwise plots of every column in df, coloured by the `output` column.
sns.pairplot(
    df,
    hue='output',
    palette=["#8000ff","#da8829"],
)
plt.show()
4. Data Preprocessing
back to top
4.2 Packages
# Scaling
from sklearn.preprocessing import RobustScaler
# Models
import torch
import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Metrics
from sklearn.metrics import accuracy_score, classification_report,
roc_curve
# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
print('Packages imported...')
Packages imported...
4.3 Making features model ready
4.3.1 Scaling and Encoding features
# creating a copy of df
# Plain assignment (df1 = df) would only bind a second name to the same
# object, so any in-place change to df1 would also change df. copy() gives
# df1 its own data.
df1 = df.copy()
thall_2 thall_3
0 0 0
1 1 0
2 1 0
3 1 0
4 1 0
[5 rows x 22 columns]
5. Modeling
back to top
# Report the best parameter set and best score held by `searcher`.
print(f"The best params are : {searcher.best_params_}")
print(f"The best score is : {searcher.best_score_}")
# Predict labels for X_test with gbt and report the accuracy against y_test.
y_pred = gbt.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
# Two spaces after "is" reproduce the original print's trailing space plus
# its default separator.
print(f"The test accuracy score of Gradient Boosting Classifier is  {test_accuracy}")
back to top