Capstone Project 2
In [14]: df.head()
Out[14]: Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
1 1 85 66 29 0 26.6 0.351 31 0
3 1 89 66 23 94 28.1 0.167 21 0
In [16]: df.isnull().any()
In [17]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 768 non-null int64
2 BloodPressure 768 non-null int64
3 SkinThickness 768 non-null int64
4 Insulin 768 non-null int64
5 BMI 768 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
Out[18]: Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
6 3 78 50 32 88 31.0 0.248 26 1
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
In [26]: df['Glucose'].value_counts().head(5)
Out[26]: 100 17
99 17
129 14
125 14
111 14
Name: Glucose, dtype: int64
In [20]: plt.hist(df['Glucose'])
Out[20]: (array([ 5., 0., 4., 32., 156., 211., 163., 95., 56., 46.]),
array([ 0. , 19.9, 39.8, 59.7, 79.6, 99.5, 119.4, 139.3, 159.2,
179.1, 199. ]),
<BarContainer object of 10 artists>)
In [27]: df['BloodPressure'].value_counts().head()
Out[27]: 70 57
74 52
68 45
78 45
72 44
Name: BloodPressure, dtype: int64
In [22]: plt.hist(df['BloodPressure'])
Out[22]: (array([ 35., 1., 2., 13., 107., 261., 243., 87., 14., 5.]),
array([ 0. , 12.2, 24.4, 36.6, 48.8, 61. , 73.2, 85.4, 97.6,
109.8, 122. ]),
<BarContainer object of 10 artists>)
In [28]: df['SkinThickness'].value_counts().head()
Out[28]: 0 227
32 31
30 27
27 23
23 22
Name: SkinThickness, dtype: int64
In [24]: plt.hist(df['SkinThickness'])
Out[24]: (array([231., 107., 165., 175., 78., 9., 2., 0., 0., 1.]),
array([ 0. , 9.9, 19.8, 29.7, 39.6, 49.5, 59.4, 69.3, 79.2, 89.1, 99. ]),
<BarContainer object of 10 artists>)
In [29]: df['Insulin'].value_counts().head()
Out[29]: 0 374
105 11
140 9
130 9
120 8
Name: Insulin, dtype: int64
In [30]: plt.hist(df['Insulin'])
Out[30]: (array([487., 155., 70., 30., 8., 9., 5., 1., 2., 1.]),
array([ 0. , 84.6, 169.2, 253.8, 338.4, 423. , 507.6, 592.2, 676.8,
761.4, 846. ]),
<BarContainer object of 10 artists>)
In [31]: df['BMI'].value_counts().head()
Out[31]: 32.0 13
31.6 12
31.2 12
0.0 11
33.3 10
Name: BMI, dtype: int64
In [32]: plt.hist(df['BMI'])
Out[32]: (array([ 11., 0., 15., 156., 268., 224., 78., 12., 3., 1.]),
array([ 0. , 6.71, 13.42, 20.13, 26.84, 33.55, 40.26, 46.97, 53.68,
60.39, 67.1 ]),
<BarContainer object of 10 artists>)
In [33]: df.describe().transpose()
In [34]: #Week-2
In [36]: plt.hist(Positive['BMI'],histtype='stepfilled',bins=20)
Out[36]: (array([ 2., 0., 0., 0., 0., 0., 3., 13., 38., 61., 61., 36., 27.,
14., 7., 3., 1., 1., 0., 1.]),
array([ 0. , 3.355, 6.71 , 10.065, 13.42 , 16.775, 20.13 , 23.485,
26.84 , 30.195, 33.55 , 36.905, 40.26 , 43.615, 46.97 , 50.325,
53.68 , 57.035, 60.39 , 63.745, 67.1 ]),
[<matplotlib.patches.Polygon at 0x1180d540b20>])
In [37]: Positive['BMI'].value_counts().head()
Out[37]: 32.9 8
31.6 7
33.3 6
30.5 5
32.0 5
Name: BMI, dtype: int64
In [38]: plt.hist(Positive['Glucose'],histtype='stepfilled',bins=20)
Out[38]: (array([ 2., 0., 0., 0., 0., 0., 0., 1., 4., 9., 28., 26., 36.,
27., 29., 22., 24., 21., 25., 14.]),
array([ 0. , 9.95, 19.9 , 29.85, 39.8 , 49.75, 59.7 , 69.65,
79.6 , 89.55, 99.5 , 109.45, 119.4 , 129.35, 139.3 , 149.25,
159.2 , 169.15, 179.1 , 189.05, 199. ]),
[<matplotlib.patches.Polygon at 0x1180d59bfa0>])
In [39]: Positive['Glucose'].value_counts().head()
Out[39]: 125 7
158 6
128 6
115 6
129 6
Name: Glucose, dtype: int64
In [40]: plt.hist(Positive['BloodPressure'],histtype='stepfilled',bins=20)
Out[40]: (array([16., 0., 0., 0., 0., 1., 0., 1., 6., 6., 19., 37., 56.,
36., 41., 31., 7., 4., 4., 3.]),
array([ 0. , 5.7, 11.4, 17.1, 22.8, 28.5, 34.2, 39.9, 45.6,
51.3, 57. , 62.7, 68.4, 74.1, 79.8, 85.5, 91.2, 96.9,
102.6, 108.3, 114. ]),
[<matplotlib.patches.Polygon at 0x1180d600b80>])
In [41]: Positive['BloodPressure'].value_counts().head()
Out[41]: 70 23
76 18
78 17
74 17
72 16
Name: BloodPressure, dtype: int64
In [42]: plt.hist(Positive['SkinThickness'],histtype='stepfilled',bins=20)
Out[42]: (array([88., 1., 4., 10., 18., 30., 41., 34., 23., 15., 1., 1., 1.,
0., 0., 0., 0., 0., 0., 1.]),
array([ 0. , 4.95, 9.9 , 14.85, 19.8 , 24.75, 29.7 , 34.65, 39.6 ,
44.55, 49.5 , 54.45, 59.4 , 64.35, 69.3 , 74.25, 79.2 , 84.15,
89.1 , 94.05, 99. ]),
[<matplotlib.patches.Polygon at 0x1180d654e50>])
In [43]: Positive['SkinThickness'].value_counts().head()
Out[43]: 0 88
32 14
33 9
30 9
39 8
Name: SkinThickness, dtype: int64
In [44]: plt.hist(Positive['Insulin'],histtype='stepfilled',bins=20)
Out[44]: (array([141., 6., 23., 33., 24., 12., 7., 7., 2., 1., 1.,
5., 3., 1., 1., 0., 0., 0., 0., 1.]),
array([ 0. , 42.3, 84.6, 126.9, 169.2, 211.5, 253.8, 296.1, 338.4,
380.7, 423. , 465.3, 507.6, 549.9, 592.2, 634.5, 676.8, 719.1,
761.4, 803.7, 846. ]),
[<matplotlib.patches.Polygon at 0x1180d43b850>])
In [46]: Positive['Insulin'].value_counts().head()
Out[46]: 0 138
130 6
180 4
156 3
175 3
Name: Insulin, dtype: int64
In [52]: g =sns.scatterplot(x="Glucose",y="BloodPressure",
hue="Outcome",
data=df);
In [55]: df.corr()
In [56]: sns.heatmap(df.corr())
Out[56]: <AxesSubplot:>
In [57]: plt.subplots(figsize=(8,8))
sns.heatmap(df.corr(),annot=True,cmap='viridis')
Out[57]: <AxesSubplot:>
In [58]: plt.subplots(figsize=(8,8))
sns.heatmap(df.corr(),annot=True)
Out[58]: <AxesSubplot:>
In [59]: df.head(5)
Out[59]: Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
1 1 85 66 29 0 26.6 0.351 31 0
3 1 89 66 23 94 28.1 0.167 21 0
Out[62]: LogisticRegression()
In [63]: print(model.score(X_train,y_train))
print(model.score(X_test,y_test))
0.7719869706840391
0.7662337662337663
AUC: 0.837
Out[66]: [<matplotlib.lines.Line2D at 0x1180f5e6340>]
Out[67]: DecisionTreeClassifier(max_depth=5)
In [68]: model3.score(X_train,y_train)
Out[68]: 0.8289902280130294
In [69]: model3.score(X_test,y_test)
Out[69]: 0.7597402597402597
Out[70]: RandomForestClassifier(n_estimators=11)
In [71]: model4.score(X_train,y_train)
Out[71]: 0.988599348534202
In [72]: model4.score(X_test,y_test)
Out[72]: 0.7467532467532467
Out[73]: SVC(gamma='auto')
In [74]: model5.score(X_test,y_test)
Out[74]: 0.6168831168831169
Out[75]: KNeighborsClassifier(n_neighbors=7)
# ROC analysis for model2 on `features` / `label` (defined in earlier cells).
# Predict class-membership probabilities for every sample.
probs = model2.predict_proba(features)
# Keep probabilities for the positive class only (column 1).
probs = probs[:, 1]
# Area under the ROC curve — threshold-independent ranking quality.
auc_score = roc_auc_score(label, probs)
print('AUC: %.3f' % auc_score)
# Compute the ROC curve points (TPR vs. FPR at each probability threshold).
fpr, tpr, thresholds = roc_curve(label, probs)
# NOTE: original line was truncated in the transcript; restored to print all three arrays.
print("True Positive Rate - {}, False Positive Rate - {} Thresholds - {}".format(tpr, fpr, thresholds))
# Diagonal = a no-skill classifier (random guessing) for reference.
plt.plot([0, 1], [0, 1], linestyle='--')
# Plot the model's ROC curve.
plt.plot(fpr, tpr, marker='.')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
AUC: 0.836
True Positive Rate - [0. 0.06716418 0.23880597 0.44776119 0.60074627 0.75373134
0.88059701 0.98507463 1. ], False Positive Rate - [0. 0. 0.02 0.056 0.12 0.248
0.428 0.668 1. ] Thresholds - [2. 1. 0.85714286 0.71428571 0.57142857 0.42857143
0.28571429 0.14285714 0. ]
Out[76]: Text(0, 0.5, 'True Positive Rate')
In [77]: from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
# predict probabilities
probs = model.predict_proba(features)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# predict class values
yhat = model.predict(features)
# calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(label, probs)
# calculate F1 score
f1 = f1_score(label, yhat)
# calculate precision-recall AUC
auc = auc(recall, precision)
# calculate average precision score
ap = average_precision_score(label, probs)
print('f1=%.3f auc=%.3f ap=%.3f' % (f1, auc, ap))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the precision-recall curve for the model
plt.plot(recall, precision, marker='.')