
ML2 Practical List


1. Check the K-means algorithm on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.1
# (i.) Mall Customers
# Import the libraries
import pandas as pd
import matplotlib.pyplot as plt

# Fetch the dataset
df = pd.read_csv('Mall_Customers.csv')
df.head()
df.info()

# Select features for training the model
X = df.iloc[:, [3, 4]].values

# Find the optimum value of k for K-means
from sklearn.cluster import KMeans

# Within-Cluster Sum of Squares (WCSS)
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()
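
As a cross-check on the elbow plot, the average silhouette score can be computed for each candidate k. This is a minimal sketch, assuming X is the feature matrix selected above; silhouette_score comes from sklearn.metrics:

# Sketch: cross-check the elbow with silhouette scores
# (assumes X is the Annual Income / Spending Score matrix above).
from sklearn.metrics import silhouette_score

for k in range(2, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    labels_k = km.fit_predict(X)
    print(k, silhouette_score(X, labels_k))  # higher is better

A peak near k = 5 would support the choice made below.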
# Train the model with the chosen k
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s=100, c='black', label='Cluster 5')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title("Clusters of Customers")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.show()

(ii.) make_blobs
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X, y = make_blobs(random_state=1)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
print("Cluster memberships:\n{}".format(kmeans.labels_))

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend()
plt.show()
print(kmeans.cluster_centers_)

(iii.) make_moons
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(X)
y_pred = kmeans.predict(X)
y_pred

plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=60, cmap='Paired')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=60, marker='^', c=range(kmeans.n_clusters), linewidth=2, cmap='Paired')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
print("Cluster memberships: \n{}".format(y_pred))
(iv.) MNIST (sklearn digits)
import numpy as np
from sklearn.datasets import load_digits

data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def bench_k_means(kmeans, name, data, labels):
    """Benchmark to evaluate the KMeans initialization methods.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with the initialization
        already set.
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the clustering metrics which requires some
        supervision.
    """
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset
    results += [
        metrics.silhouette_score(
            data,
            estimator[-1].labels_,
            metric="euclidean",
            sample_size=300,
        )
    ]

    # Show the results
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette")

kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)

kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)

pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(82 * "_")

import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh. Use the last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title(
    "K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

2. Check the Mini-batch K-means algorithm on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.2
# (i.) make_blobs
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.datasets import make_blobs as mb
import matplotlib.pyplot as plt
import timeit as tm
import warnings

warnings.filterwarnings('ignore')

centre = [[0, 0], [1000, 0], [1000, 1000], [0, 1000]]
dataset, labels = mb(n_samples=1000, centers=centre, cluster_std=200)
color = ['orange', 'purple', 'green', 'cyan']
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[labels[i]], alpha=0.6)

# Standard K-means, timed for comparison
kmeans = KMeans(n_clusters=4, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')

# Mini-batch K-means
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=20, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
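
Besides wall-clock time, the two fits can also be compared on their final inertia (WCSS); mini-batch updates usually converge to a slightly worse optimum. A short sketch, assuming `dataset` from above is still in scope:

# Sketch: compare final inertia of full vs mini-batch K-means
# on the same blobs (assumes `dataset` from above).
km_full = KMeans(n_clusters=4, random_state=0).fit(dataset)
km_mini = MiniBatchKMeans(n_clusters=4, batch_size=20, random_state=0).fit(dataset)
print("KMeans inertia:         ", km_full.inertia_)
print("MiniBatchKMeans inertia:", km_mini.inertia_)  # typically slightly higher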

(ii.) Mall Customers
# Fetch the dataset
import pandas as pd
df = pd.read_csv('Mall_Customers.csv')
df.head()

# Select features for training the model
X = df.iloc[:, [3, 4]].values

# Find the optimum value of k
# Within-Cluster Sum of Squares (WCSS)
wcss = []
for i in range(1, 11):
    mnkmeans = MiniBatchKMeans(n_clusters=i, batch_size=20, random_state=42)
    mnkmeans.fit(X)
    wcss.append(mnkmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()

# Time standard K-means on the selected features
kmeans = KMeans(n_clusters=5, random_state=42)
start = tm.default_timer()
kmeans.fit(X)
end = tm.default_timer()
print(end - start)

# Time Mini-batch K-means on the same features
mnkmeans = MiniBatchKMeans(n_clusters=5, batch_size=20, random_state=42)
start = tm.default_timer()
mnkmeans.fit(X)
end = tm.default_timer()
print(end - start)

# Train the model and plot the clusters
mnkmeans = MiniBatchKMeans(n_clusters=5, batch_size=20, random_state=42)
y_mnkmeans = mnkmeans.fit_predict(X)
plt.scatter(X[y_mnkmeans == 0, 0], X[y_mnkmeans == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_mnkmeans == 1, 0], X[y_mnkmeans == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_mnkmeans == 2, 0], X[y_mnkmeans == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_mnkmeans == 3, 0], X[y_mnkmeans == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_mnkmeans == 4, 0], X[y_mnkmeans == 4, 1], s=100, c='black', label='Cluster 5')
plt.scatter(mnkmeans.cluster_centers_[:, 0], mnkmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title("Clusters of Customers")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.show()
(iii.) make_moons
from sklearn.datasets import make_moons as mm

dataset, labels = mm(n_samples=500, noise=0.05)
color = ['orange', 'purple', 'green', 'cyan']
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[labels[i]], alpha=0.6)

# Standard K-means, timed for comparison
kmeans = KMeans(n_clusters=2, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')

# Mini-batch K-means
kmeans = MiniBatchKMeans(n_clusters=2, batch_size=20, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
(iv.) MNIST (sklearn digits)
from sklearn.datasets import load_digits
import numpy as np

data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def bench_k_means(kmeans, name, data, labels):
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset
    results += [
        metrics.silhouette_score(
            data,
            estimator[-1].labels_,
            metric="euclidean",
            sample_size=300,
        )
    ]

    # Show the results
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA

print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette")

kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)

kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)

pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(82 * "_")

import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
start = tm.default_timer()
kmeans.fit(reduced_data)
end = tm.default_timer()
print(end - start)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh. Use the last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title(
    "K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

# Mini-batch K-means
import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = MiniBatchKMeans(batch_size=200, n_clusters=n_digits, n_init=4)
start = tm.default_timer()
kmeans.fit(reduced_data)
end = tm.default_timer()
print(end - start)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh. Use the last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title(
    "Mini-batch K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
3. Check hierarchical agglomerative clustering on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.3.
(i.) Mall Customers
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset
df = pd.read_csv('Mall_Customers.csv')
#df.head()

# Feature selection
X = df.iloc[:, [3, 4]].values
#type(X)

import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, 'ward'))

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
#y_hc
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='magenta', label='Cluster 5')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
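
The same five clusters can also be read directly off the SciPy linkage by cutting the tree, without going through AgglomerativeClustering; a minimal sketch, assuming X from above:

# Sketch: cut the ward linkage into 5 flat clusters with SciPy
# (assumes X from the Mall Customers section above).
from scipy.cluster.hierarchy import linkage, fcluster
Z = linkage(X, 'ward')
flat = fcluster(Z, t=5, criterion='maxclust')  # labels are 1..5
print(flat[:10])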
(ii.) make_blobs
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

X, y = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=0)
#print(X)
#print(y)

import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, 'ward'))

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
#y_hc
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='yellow', label='Cluster 3')
plt.title('Clusters of make_blobs')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend()
plt.show()
(iii.) make_moons
import numpy as np
from sklearn import cluster
import time
from itertools import cycle, islice
from sklearn.datasets import make_moons

noisy_moons = make_moons(n_samples=1500, noise=0.05)
plot_num = 1
default_base = {"n_clusters": 3}
datasets = [(noisy_moons, {"n_clusters": 2})]

from sklearn.preprocessing import StandardScaler

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # ============
    # Create cluster objects
    # ============
    ward = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="ward"
    )
    complete = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="complete"
    )
    average = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="average"
    )
    single = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="single"
    )

    clustering_algorithms = (
        ("Single Linkage", single),
        ("Average Linkage", average),
        ("Complete Linkage", complete),
        ("Ward Linkage", ward),
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the "
                + "connectivity matrix is [0-9]{1,2}"
                + " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning,
            )
            algorithm.fit(X)

        t1 = time.time()
        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        colors = np.array(
            list(
                islice(
                    cycle(
                        [
                            "#377eb8",
                            "#ff7f00",
                            "#4daf4a",
                            "#f781bf",
                            "#a65628",
                            "#984ea3",
                            "#999999",
                            "#e41a1c",
                            "#dede00",
                        ]
                    ),
                    int(max(y_pred) + 1),
                )
            )
        )
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plt.text(
            0.99,
            0.01,
            ("%.2fs" % (t1 - t0)).lstrip("0"),
            transform=plt.gca().transAxes,
            size=15,
            horizontalalignment="right",
        )
        plot_num += 1
plt.show()

(iv.) MNIST (sklearn digits)
from sklearn.datasets import load_digits
import numpy as np

digits = load_digits()
data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

X = digits.data

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
model = model.fit(X)

plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

from sklearn.cluster import AgglomerativeClustering

hc = AgglomerativeClustering(n_clusters=9, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
#y_hc
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='magenta', label='Cluster 5')
plt.scatter(X[y_hc == 5, 0], X[y_hc == 5, 1], s=100, c='yellow', label='Cluster 6')
plt.scatter(X[y_hc == 6, 0], X[y_hc == 6, 1], s=100, c='orange', label='Cluster 7')
plt.scatter(X[y_hc == 7, 0], X[y_hc == 7, 1], s=100, c='purple', label='Cluster 8')
plt.scatter(X[y_hc == 8, 0], X[y_hc == 8, 1], s=100, c='black', label='Cluster 9')
plt.title('Clusters of load_digits')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend()
plt.show()
4. Check the DBSCAN algorithm on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.4.
(i.) Mall Customers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

df = pd.read_csv('Mall_Customers.csv')
#df.head()

X = df.iloc[:, [3, 4]].values

# Use the nearest-neighbour distances to pick eps
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=2)
nbrs = nn.fit(X)

distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)  # sorting the distances
distances = distances[:, 1]  # taking the second column of the sorted distances
plt.rcParams['figure.figsize'] = (5, 3)  # setting the figure size
plt.plot(distances)  # plotting the distances
plt.show()  # showing the plot

dbscan = DBSCAN(eps=8, min_samples=4)
dbscan.fit(X)
labels = dbscan.labels_
#labels
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='plasma')
plt.xlabel('Income')
plt.ylabel('Spending score')
plt.show()
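
To summarize the fit numerically, the cluster and noise counts can be printed, along with a silhouette score over the non-noise points; a short sketch, assuming X and `labels` from the DBSCAN fit above:

# Sketch: summarize the DBSCAN result (assumes X and `labels` above).
from sklearn.metrics import silhouette_score

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print('Clusters:', n_clusters, ' Noise points:', n_noise)
mask = labels != -1  # silhouette is defined only for clustered points
if n_clusters > 1:
    print('Silhouette:', silhouette_score(X[mask], labels[mask]))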
(ii.) make_blobs
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)

X = StandardScaler().fit_transform(X)

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# Plot result
import matplotlib.pyplot as plt
%matplotlib inline

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

(iii.) make_moons
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics import v_measure_score

X, y = make_moons(n_samples=500, noise=0.1)
df = pd.DataFrame(X, columns=["X1", "X2"])
#df.head()

plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# DBSCAN with default parameters
dbscan_cluster1 = DBSCAN()
dbscan_cluster1.fit(X)

# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster1.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# Number of clusters
labels = dbscan_cluster1.labels_
N_clus = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)

# Identify noise
n_noise = list(dbscan_cluster1.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)

# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
# DBSCAN with a smaller eps
dbscan_cluster = DBSCAN(eps=0.1, min_samples=8)
dbscan_cluster.fit(X)

# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# Number of clusters
labels = dbscan_cluster.labels_
N_clus = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)

# Identify noise
n_noise = list(dbscan_cluster.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)

# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))

# Pick eps from the k-distance plot
from sklearn.neighbors import NearestNeighbors

nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(df)

distances, indices = neighbors.kneighbors(df)
distances = np.sort(distances[:, 10], axis=0)

fig = plt.figure(figsize=(5, 5))
plt.plot(distances)
plt.xlabel("Points")
plt.ylabel("Distance")

from kneed import KneeLocator  # pip install kneed

i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex',
                   direction='increasing', interp_method='polynomial')
fig = plt.figure(figsize=(5, 5))
knee.plot_knee()
plt.xlabel("Points")
plt.ylabel("Distance")

print(distances[knee.knee])

# DBSCAN with eps taken from the knee point
dbscan_cluster = DBSCAN(eps=0.163, min_samples=8)
dbscan_cluster.fit(X)

# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# Number of clusters
labels = dbscan_cluster.labels_
N_clus = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)

# Identify noise
n_noise = list(dbscan_cluster.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)

# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
(iv.) MNIST (sklearn digits)
import sys
import matplotlib.pyplot as plt
from time import time
from sklearn.datasets import load_digits

digits = load_digits()
fig, axes = plt.subplots(2, 5, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for ax, img in zip(axes.ravel(), digits.images):
    ax.imshow(img)
# print(len(digits.data))  # 1797

colors = ["#476A2A", "#7851B8", '#BD3430', '#4A2D4E', '#875525',
          '#A83683', '#4E655E', '#853541', '#3A3120', '#535D8E']

t0 = time()
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca = pca.fit(digits.data)
digits_pca = pca.transform(digits.data)

plt.figure(figsize=(10, 10))
plt.xlim(digits_pca[:, 0].min(), digits_pca[:, 0].max())
plt.ylim(digits_pca[:, 1].min(), digits_pca[:, 1].max())

for i in range(len(digits.data)):
    plt.text(digits_pca[i, 0], digits_pca[i, 1],
             str(digits.target[i]),
             color=colors[digits.target[i]],
             fontdict={'weight': 'bold', 'size': 9})
plt.title('PCA')
plt.xlabel("first PC")
plt.ylabel("second PC")
print("PCA time: ", time() - t0)
plt.show()

# Feature scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler = scaler.fit(digits_pca)
scaled_p = scaler.transform(digits_pca)

# PCA -> DBSCAN
t2 = time()
from sklearn.cluster import DBSCAN
import numpy as np
db = DBSCAN(eps=0.122, min_samples=10).fit(scaled_p)
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("number of clusters in pca-DBSCAN: ", n_clusters_)

plt.scatter(scaled_p[:, 0], scaled_p[:, 1], c=labels, s=60, edgecolors='black')
plt.title('PCA -> DBSCAN')
plt.xlabel("first PC")
plt.ylabel("second PC")
print("DBSCAN time: ", time() - t2)
plt.show()
5. Check the voting classifier ensemble technique on different datasets such as Titanic and Iris.
Ans.5
(i.) Titanic
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

data = pd.read_csv('titanic.csv')
data.head()
data.info()

# Fill missing values
data['Parch'] = data['Parch'].fillna(0)
data['SibSp'] = data['SibSp'].fillna(0)
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Pclass'] = data['Pclass'].fillna(0)
data['Embarked'] = data['Embarked'].fillna('0')

# Encode categorical features
lblenc = LabelEncoder()
lblenc.fit(data['Sex'])
data['Sex'] = lblenc.transform(data['Sex'])
data['Embarked'] = data['Embarked'].replace('S', 1)
data['Embarked'] = data['Embarked'].replace('C', 2)
data['Embarked'] = data['Embarked'].replace('Q', 3)
data.info()

data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived', 'Embarked']]
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle=True)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Ensemble of models
estimator = []
estimator.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)))
estimator.append(('SVC', SVC(gamma='auto', probability=True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('NN', MLPClassifier()))

from sklearn.ensemble import VotingClassifier

# Voting classifier with hard voting
hard_voting = VotingClassifier(estimators=estimator, voting='hard')
hard_voting.fit(X_train, y_train)
y_pred = hard_voting.predict(X_test)

from sklearn.metrics import confusion_matrix
CM = confusion_matrix(y_test, y_pred)
print('Confusion Matrix is : \n', CM)

Output:
Confusion Matrix is :
 [[255  16]
 [ 41 120]]

from sklearn.metrics import accuracy_score
# accuracy_score metric to measure accuracy
score = accuracy_score(y_test, y_pred)
print("Hard Voting Score: ", score * 100)

# Voting classifier with soft voting
soft_voting = VotingClassifier(estimators=estimator, voting='soft')
soft_voting.fit(X_train, y_train)
y_pred = soft_voting.predict(X_test)
# Using accuracy_score
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score:", score * 100)
(ii.) Iris
from sklearn.datasets import load_iris

# Loading the iris dataset
iris = load_iris()
X = iris.data[:, :4]
y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Ensemble of models
estimator = []
estimator.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)))
estimator.append(('SVC', SVC(gamma='auto', probability=True)))
estimator.append(('DTC', DecisionTreeClassifier()))

from sklearn.ensemble import VotingClassifier

# Voting classifier with hard voting
hard_voting = VotingClassifier(estimators=estimator, voting='hard')
hard_voting.fit(X_train, y_train)
y_pred = hard_voting.predict(X_test)
print(y_pred)
print(y_test)

from sklearn.metrics import accuracy_score
# accuracy_score metric to measure accuracy
score = accuracy_score(y_test, y_pred)
print("Hard Voting Score: ", score)

# Voting classifier with soft voting
soft_voting = VotingClassifier(estimators=estimator, voting='soft')
soft_voting.fit(X_train, y_train)
y_pred = soft_voting.predict(X_test)

# Using accuracy_score
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score: ", score)
