Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
11 views

Program 2 Hierarchical Clustering

BA

Uploaded by

9738978362.mj
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views

Program 2 Hierarchical Clustering

BA

Uploaded by

9738978362.mj
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 5

import pandas as pd

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the per-country indicators from a CSV expected in the working directory.
df = pd.read_csv("Country-data.csv")

# Notebook display of the loaded frame.
df

country child_mort exports health imports income inflation life_expec total_fer gdpp

0 Afghanistan 90.2 10.0 7.58 44.9 1610 9.44 56.2 5.82 553

1 Albania 16.6 28.0 6.55 48.6 9930 4.49 76.3 1.65 4090

2 Algeria 27.3 38.4 4.17 31.4 12900 16.10 76.5 2.89 4460

3 Angola 119.0 62.3 2.85 42.9 5900 22.40 60.1 6.16 3530

4 Antigua and Barbuda 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200

... ... ... ... ... ... ... ... ... ... ...

162 Vanuatu 29.2 46.6 5.25 52.7 2950 2.62 63.0 3.50 2970

163 Venezuela 17.1 28.5 4.91 17.6 16500 45.90 75.4 2.47 13500

164 Vietnam 23.3 72.0 6.84 80.2 4490 12.10 73.1 1.95 1310

165 Yemen 56.3 30.0 5.18 34.4 4480 23.60 67.5 4.67 1310

166 Zambia 83.1 37.0 5.89 30.9 3280 14.00 52.0 5.40 1460

167 rows × 10 columns

# Count missing values per column.
df.isnull().sum()

country 0
child_mort 0
exports 0
health 0
imports 0
income 0
inflation 0
life_expec 0
total_fer 0
gdpp 0
dtype: int64

# Show the dtype pandas inferred for each column.
df.dtypes

country object
child_mort float64
exports float64
health float64
imports float64
income int64
inflation float64
life_expec float64
total_fer float64
gdpp int64
dtype: object

# Work on a copy without the `country` column (the only non-numeric one),
# leaving just the numeric indicators for plotting and clustering.
df_new = df.drop(columns=["country"])
df_new

child_mort exports health imports income inflation life_expec total_fer gdpp

0 90.2 10.0 7.58 44.9 1610 9.44 56.2 5.82 553

1 16.6 28.0 6.55 48.6 9930 4.49 76.3 1.65 4090

2 27.3 38.4 4.17 31.4 12900 16.10 76.5 2.89 4460

3 119.0 62.3 2.85 42.9 5900 22.40 60.1 6.16 3530

4 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200

... ... ... ... ... ... ... ... ... ...

162 29.2 46.6 5.25 52.7 2950 2.62 63.0 3.50 2970

163 17.1 28.5 4.91 17.6 16500 45.90 75.4 2.47 13500

164 23.3 72.0 6.84 80.2 4490 12.10 73.1 1.95 1310

165 56.3 30.0 5.18 34.4 4480 23.60 67.5 4.67 1310

166 83.1 37.0 5.89 30.9 3280 14.00 52.0 5.40 1460

167 rows × 9 columns

# Draw one boxplot per feature on a 3x3 grid to inspect outliers before capping.
# A single figure is created; a second plt.figure() call would leave an empty
# extra figure behind.
plt.figure(figsize=(12, 8))
feature_list = df_new.columns
for i, feature in enumerate(feature_list):
    # Subplot positions are 1-based.
    plt.subplot(3, 3, i + 1)
    sns.boxplot(y=df_new[feature], data=df_new)
    plt.title('Boxplot of {}'.format(feature))
# Fix the spacing once, after every subplot has been drawn.
plt.tight_layout()

def remove_outlier(col, k=1.5):
    """Return the lower and upper IQR fences used to detect outliers in *col*.

    Despite its name, this function removes nothing: it only computes the
    boundaries, and the caller decides what to do with values outside them.

    Args:
        col: A pandas Series (any object whose ``quantile`` accepts a list of
            probabilities and yields the two quartiles in order).
        k: Multiplier applied to the interquartile range. Defaults to 1.5.

    Returns:
        A ``(lower_range, upper_range)`` tuple equal to
        ``(Q1 - k * IQR, Q3 + k * IQR)``.
    """
    q1, q3 = col.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_range = q1 - (k * iqr)
    upper_range = q3 + (k * iqr)
    return lower_range, upper_range

# Cap and floor the values beyond the outlier boundaries.
# Iterating df_new.columns directly keeps this step independent of the
# plotting cell that happens to define `feature_list`.
for feature in df_new.columns:
    lower, upper = remove_outlier(df_new[feature])
    # One clip call replaces the pair of np.where passes: values above the
    # upper fence become the fence, values below the lower fence likewise.
    df_new[feature] = df_new[feature].clip(lower=lower, upper=upper)

# Redraw the per-feature boxplots after capping to confirm that no points
# remain beyond the whiskers.
plt.figure(figsize=(12, 8))
feature_list = df_new.columns
for i, feature in enumerate(feature_list):
    # Subplot positions are 1-based.
    plt.subplot(3, 3, i + 1)
    sns.boxplot(y=df_new[feature], data=df_new)
    plt.title('Boxplot of {}'.format(feature))
# Fix the spacing once, after every subplot has been drawn.
plt.tight_layout()
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the agglomerative merge tree using Ward linkage on the capped features.
# NOTE(review): the columns are passed in their raw units, so large-magnitude
# features (income, gdpp) presumably dominate the distances — consider
# standardizing before clustering; confirm intent.
wardlink = linkage(df_new, method="ward")

# Plot the full dendrogram.
dend = dendrogram(wardlink)

# Condensed view: show only the last 10 merged clusters instead of every leaf.
dend = dendrogram(wardlink, truncate_mode="lastp", p=10)
from scipy.cluster.hierarchy import fcluster

# Method 1: cut the tree so that every record is assigned to one of at most
# 5 flat clusters.
clusters = fcluster(wardlink, t=5, criterion="maxclust")
clusters

array([4, 5, 5, 5, 3, 3, 5, 1, 1, 5, 2, 1, 4, 3, 5, 1, 5, 4, 5, 5, 5, 5,
3, 1, 5, 4, 4, 4, 4, 1, 5, 4, 4, 3, 5, 5, 4, 4, 5, 5, 4, 3, 2, 2,
1, 5, 5, 5, 5, 2, 4, 3, 5, 1, 2, 5, 4, 5, 1, 4, 2, 5, 5, 4, 4, 5,
4, 3, 1, 4, 5, 5, 5, 1, 2, 2, 5, 2, 5, 3, 4, 4, 1, 4, 4, 3, 5, 4,
4, 2, 3, 1, 5, 4, 4, 3, 5, 4, 2, 4, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4,
1, 2, 4, 5, 1, 1, 4, 5, 5, 5, 5, 3, 2, 1, 5, 3, 4, 5, 1, 4, 5, 3,
4, 1, 3, 2, 4, 5, 2, 2, 5, 5, 4, 5, 1, 1, 4, 4, 5, 4, 4, 5, 5, 3,
5, 4, 5, 1, 2, 1, 3, 4, 4, 3, 4, 4, 4], dtype=int32)

# Method 2 (based on distance): cut the tree at a distance threshold of 23.
# The result gets its own name so that it does not overwrite the 5-cluster
# labels in `clusters`, which are the ones attached to `df` afterwards.
clusters_by_distance = fcluster(wardlink, 23, criterion="distance")

clusters_by_distance

array([ 79, 117, 156, 137, 53, 52, 129, 10, 18, 144, 36, 9, 60,
40, 145, 11, 110, 69, 130, 131, 113, 160, 39, 4, 146, 86,
77, 61, 56, 12, 138, 72, 71, 54, 114, 152, 83, 75, 133,
162, 57, 55, 31, 21, 20, 153, 115, 109, 123, 26, 84, 43,
125, 14, 30, 142, 80, 126, 13, 97, 34, 149, 127, 91, 87,
139, 81, 41, 15, 101, 107, 147, 157, 16, 35, 28, 111, 29,
120, 45, 62, 64, 4, 63, 104, 50, 143, 58, 76, 27, 47,
1, 155, 85, 74, 46, 150, 70, 22, 98, 140, 92, 106, 121,
161, 128, 73, 105, 112, 68, 17, 33, 78, 134, 5, 7, 103,
141, 124, 116, 132, 42, 23, 4, 148, 49, 88, 135, 8, 59,
158, 48, 89, 1, 44, 24, 65, 151, 25, 32, 108, 154, 95,
163, 19, 2, 66, 67, 159, 94, 90, 136, 119, 51, 118, 82,
122, 3, 28, 6, 37, 102, 93, 38, 99, 100, 96], dtype=int32)

# Attach the cluster labels to the original frame, which still carries the
# country names.
df["clusters"] = clusters
df

country child_mort exports health imports income inflation life_expec total_fer gdpp clusters

0 Afghanistan 90.2 10.0 7.58 44.9 1610 9.44 56.2 5.82 553 4

1 Albania 16.6 28.0 6.55 48.6 9930 4.49 76.3 1.65 4090 5

2 Algeria 27.3 38.4 4.17 31.4 12900 16.10 76.5 2.89 4460 5

3 Angola 119.0 62.3 2.85 42.9 5900 22.40 60.1 6.16 3530 5

4 Antigua and Barbuda 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200 3

... ... ... ... ... ... ... ... ... ... ... ...

162 Vanuatu 29.2 46.6 5.25 52.7 2950 2.62 63.0 3.50 2970 4

163 Venezuela 17.1 28.5 4.91 17.6 16500 45.90 75.4 2.47 13500 3

164 Vietnam 23.3 72.0 6.84 80.2 4490 12.10 73.1 1.95 1310 4

165 Yemen 56.3 30.0 5.18 34.4 4480 23.60 67.5 4.67 1310 4

166 Zambia 83.1 37.0 5.89 30.9 3280 14.00 52.0 5.40 1460 4

167 rows × 11 columns

# Persist the labelled data to CSV.
# NOTE(review): with pandas defaults the row index is written as an extra
# unnamed first column — pass index=False if that is not wanted; confirm.
df.to_csv("country_data_clust.csv")
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js

You might also like