Program 2 Hierarchical Clustering
Program 2 Hierarchical Clustering
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Load the per-country indicators from the CSV file into a DataFrame.
df = pd.read_csv("Country-data.csv")
# Bare expression: displays the DataFrame when run as a notebook cell.
df
country child_mort exports health imports income inflation life_expec total_fer gdpp
0 Afghanistan 90.2 10.0 7.58 44.9 1610 9.44 56.2 5.82 553
1 Albania 16.6 28.0 6.55 48.6 9930 4.49 76.3 1.65 4090
2 Algeria 27.3 38.4 4.17 31.4 12900 16.10 76.5 2.89 4460
3 Angola 119.0 62.3 2.85 42.9 5900 22.40 60.1 6.16 3530
4 Antigua and Barbuda 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200
... ... ... ... ... ... ... ... ... ... ...
162 Vanuatu 29.2 46.6 5.25 52.7 2950 2.62 63.0 3.50 2970
163 Venezuela 17.1 28.5 4.91 17.6 16500 45.90 75.4 2.47 13500
164 Vietnam 23.3 72.0 6.84 80.2 4490 12.10 73.1 1.95 1310
165 Yemen 56.3 30.0 5.18 34.4 4480 23.60 67.5 4.67 1310
166 Zambia 83.1 37.0 5.89 30.9 3280 14.00 52.0 5.40 1460
# Count the missing values in each column.
df.isnull().sum()
country 0
child_mort 0
exports 0
health 0
imports 0
income 0
inflation 0
life_expec 0
total_fer 0
gdpp 0
dtype: int64
# Inspect the data type of each column.
df.dtypes
country object
child_mort float64
exports float64
health float64
imports float64
income int64
inflation float64
life_expec float64
total_fer float64
gdpp int64
dtype: object
# Drop the 'country' label column so that only the numeric features remain.
df_new = df.drop(['country'],axis = 1)
# Bare expression: displays the reduced DataFrame when run as a notebook cell.
df_new
... ... ... ... ... ... ... ... ... ...
162 29.2 46.6 5.25 52.7 2950 2.62 63.0 3.50 2970
163 17.1 28.5 4.91 17.6 16500 45.90 75.4 2.47 13500
164 23.3 72.0 6.84 80.2 4490 12.10 73.1 1.95 1310
165 56.3 30.0 5.18 34.4 4480 23.60 67.5 4.67 1310
166 83.1 37.0 5.89 30.9 3280 14.00 52.0 5.40 1460
# Draw one boxplot per feature of df_new to inspect spread and outliers.
# A single figure is created; a second plt.figure() call would only leave
# an empty extra figure behind.
plt.figure(figsize=(12, 8))
feature_list = df_new.columns
n_cols = 3
# Ceiling division so the grid always has enough rows for every feature
# (9 features -> the same 3 x 3 grid as before).
n_rows = -(-len(feature_list) // n_cols)
for position, feature in enumerate(feature_list, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(y=df_new[feature])
    plt.title(f'Boxplot of {feature}')
# Adjust the spacing once, after all subplots have been created.
plt.tight_layout()
# Define a function which returns the lower and upper limits used to detect outliers for a feature
def remove_outlier(col, whisker=1.5):
    """Return the IQR-based outlier limits for one feature.

    Despite its name, this function does not remove or modify any data; it
    only computes the two limits.

    Args:
        col: Object exposing ``quantile(list_of_probabilities)`` whose result
            unpacks into the requested quantiles (e.g. a pandas Series).
        whisker: Multiple of the inter-quartile range added beyond each
            quartile. Defaults to 1.5.

    Returns:
        Tuple ``(lower_range, upper_range)`` where
        ``lower_range = Q1 - whisker * IQR`` and
        ``upper_range = Q3 + whisker * IQR``.
    """
    q1, q3 = col.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_range = q1 - (whisker * iqr)
    upper_range = q3 + (whisker * iqr)
    return lower_range, upper_range
# Draw one boxplot per feature of df_new.
# NOTE(review): no code that applies the remove_outlier() limits to df_new is
# visible before this point, so this plot presumably shows unchanged data —
# confirm whether an outlier-treatment step is missing.
plt.figure(figsize=(12, 8))
feature_list = df_new.columns
n_cols = 3
# Ceiling division so the grid always has enough rows for every feature
# (9 features -> the same 3 x 3 grid as before).
n_rows = -(-len(feature_list) // n_cols)
for position, feature in enumerate(feature_list, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.boxplot(y=df_new[feature])
    plt.title(f'Boxplot of {feature}')
# Adjust the spacing once, after all subplots have been created.
plt.tight_layout()
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the hierarchical-clustering linkage matrix from the numeric features.
# NOTE(review): `wardlink` was never assigned in the visible source although
# `linkage` is imported; the Ward method and the unscaled df_new input are
# inferred from the variable name — confirm against the original notebook.
wardlink = linkage(df_new, method="ward")
# Plot the dendrogram of the linkage matrix and keep the returned plot data.
dend = dendrogram(wardlink)
array([4, 5, 5, 5, 3, 3, 5, 1, 1, 5, 2, 1, 4, 3, 5, 1, 5, 4, 5, 5, 5, 5,
3, 1, 5, 4, 4, 4, 4, 1, 5, 4, 4, 3, 5, 5, 4, 4, 5, 5, 4, 3, 2, 2,
1, 5, 5, 5, 5, 2, 4, 3, 5, 1, 2, 5, 4, 5, 1, 4, 2, 5, 5, 4, 4, 5,
4, 3, 1, 4, 5, 5, 5, 1, 2, 2, 5, 2, 5, 3, 4, 4, 1, 4, 4, 3, 5, 4,
4, 2, 3, 1, 5, 4, 4, 3, 5, 4, 2, 4, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4,
1, 2, 4, 5, 1, 1, 4, 5, 5, 5, 5, 3, 2, 1, 5, 3, 4, 5, 1, 4, 5, 3,
4, 1, 3, 2, 4, 5, 2, 2, 5, 5, 4, 5, 1, 1, 4, 4, 5, 4, 4, 5, 5, 3,
5, 4, 5, 1, 2, 1, 3, 4, 4, 3, 4, 4, 4], dtype=int32)
array([ 79, 117, 156, 137, 53, 52, 129, 10, 18, 144, 36, 9, 60,
40, 145, 11, 110, 69, 130, 131, 113, 160, 39, 4, 146, 86,
77, 61, 56, 12, 138, 72, 71, 54, 114, 152, 83, 75, 133,
162, 57, 55, 31, 21, 20, 153, 115, 109, 123, 26, 84, 43,
125, 14, 30, 142, 80, 126, 13, 97, 34, 149, 127, 91, 87,
139, 81, 41, 15, 101, 107, 147, 157, 16, 35, 28, 111, 29,
120, 45, 62, 64, 4, 63, 104, 50, 143, 58, 76, 27, 47,
1, 155, 85, 74, 46, 150, 70, 22, 98, 140, 92, 106, 121,
161, 128, 73, 105, 112, 68, 17, 33, 78, 134, 5, 7, 103,
141, 124, 116, 132, 42, 23, 4, 148, 49, 88, 135, 8, 59,
158, 48, 89, 1, 44, 24, 65, 151, 25, 32, 108, 154, 95,
163, 19, 2, 66, 67, 159, 94, 90, 136, 119, 51, 118, 82,
122, 3, 28, 6, 37, 102, 93, 38, 99, 100, 96], dtype=int32)
# Store the cluster label of each row in a new 'clusters' column.
# NOTE(review): `clusters` is not assigned anywhere in the visible source;
# presumably it is the int32 label array (values 1-5) printed earlier — confirm
# which call produced it.
df["clusters"] = clusters
# Bare expression: displays the labelled DataFrame when run as a notebook cell.
df
country child_mort exports health imports income inflation life_expec total_fer gdpp clusters
0 Afghanistan 90.2 10.0 7.58 44.9 1610 9.44 56.2 5.82 553 4
1 Albania 16.6 28.0 6.55 48.6 9930 4.49 76.3 1.65 4090 5
2 Algeria 27.3 38.4 4.17 31.4 12900 16.10 76.5 2.89 4460 5
3 Angola 119.0 62.3 2.85 42.9 5900 22.40 60.1 6.16 3530 5
4 Antigua and Barbuda 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200 3
... ... ... ... ... ... ... ... ... ... ... ...
162 Vanuatu 29.2 46.6 5.25 52.7 2950 2.62 63.0 3.50 2970 4
163 Venezuela 17.1 28.5 4.91 17.6 16500 45.90 75.4 2.47 13500 3
164 Vietnam 23.3 72.0 6.84 80.2 4490 12.10 73.1 1.95 1310 4
165 Yemen 56.3 30.0 5.18 34.4 4480 23.60 67.5 4.67 1310 4
166 Zambia 83.1 37.0 5.89 30.9 3280 14.00 52.0 5.40 1460 4
# Write the labelled DataFrame to a CSV file.
df.to_csv("country_data_clust.csv")
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js