
ML2 Practical List


1. Check the K-means algorithm on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.1
# (i.) Mall Customers
# Import the libraries
import pandas as pd
import matplotlib.pyplot as plt

# Fetch the dataset
df = pd.read_csv('Mall_Customers.csv')
df.head()
df.info()

# Select features for training the model
X = df.iloc[:, [3, 4]].values

# Find the optimum value of k for K-means
from sklearn.cluster import KMeans

# Within-Cluster Sum of Squares (WCSS)
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()
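
As a cross-check on the elbow plot, the average silhouette score can be computed for each candidate k. This is a minimal sketch, assuming X is the feature matrix selected above; silhouette_score comes from sklearn.metrics:

# Sketch: cross-check the elbow with silhouette scores
# (assumes X is the Annual Income / Spending Score matrix above).
from sklearn.metrics import silhouette_score

for k in range(2, 11):
    km = KMeans(n_clusters=k, init='k-means++', random_state=42)
    labels_k = km.fit_predict(X)
    print(k, silhouette_score(X, labels_k))  # higher is better

A peak near k = 5 would support the choice made below.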
# Train the model with the chosen k
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s=100, c='black', label='Cluster 5')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title("Clusters of Customers")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.show()

(ii.) make_blobs
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X, y = make_blobs(random_state=1)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)
print("Cluster memberships:\n{}".format(kmeans.labels_))

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend()
plt.show()
print(kmeans.cluster_centers_)

(iii.) make_moons
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(X)
y_pred = kmeans.predict(X)
y_pred

plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=60, cmap='Paired')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=60, marker='^', c=range(kmeans.n_clusters), linewidth=2, cmap='Paired')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
print("Cluster memberships: \n{}".format(y_pred))
(iv.) MNIST (sklearn digits)
import numpy as np
from sklearn.datasets import load_digits

data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def bench_k_means(kmeans, name, data, labels):
    """Benchmark to evaluate the KMeans initialization methods.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with the initialization
        already set.
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the clustering metrics which requires some
        supervision.
    """
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset
    results += [
        metrics.silhouette_score(
            data,
            estimator[-1].labels_,
            metric="euclidean",
            sample_size=300,
        )
    ]

    # Show the results
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette")

kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)

kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)

pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(82 * "_")

import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh. Use the last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title(
    "K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

2. Check the Mini-batch K-means algorithm on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.2
# (i.) make_blobs
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.datasets import make_blobs as mb
import matplotlib.pyplot as plt
import timeit as tm
import warnings

warnings.filterwarnings('ignore')

centre = [[0, 0], [1000, 0], [1000, 1000], [0, 1000]]
dataset, labels = mb(n_samples=1000, centers=centre, cluster_std=200)
color = ['orange', 'purple', 'green', 'cyan']
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[labels[i]], alpha=0.6)

# Standard K-means, timed for comparison
kmeans = KMeans(n_clusters=4, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')

# Mini-batch K-means
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=20, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
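
Besides wall-clock time, the two fits can also be compared on their final inertia (WCSS); mini-batch updates usually converge to a slightly worse optimum. A short sketch, assuming `dataset` from above is still in scope:

# Sketch: compare final inertia of full vs mini-batch K-means
# on the same blobs (assumes `dataset` from above).
km_full = KMeans(n_clusters=4, random_state=0).fit(dataset)
km_mini = MiniBatchKMeans(n_clusters=4, batch_size=20, random_state=0).fit(dataset)
print("KMeans inertia:         ", km_full.inertia_)
print("MiniBatchKMeans inertia:", km_mini.inertia_)  # typically slightly higher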

(ii.) Mall Customers
# Fetch the dataset
import pandas as pd
df = pd.read_csv('Mall_Customers.csv')
df.head()

# Select features for training the model
X = df.iloc[:, [3, 4]].values

# Find the optimum value of k
# Within-Cluster Sum of Squares (WCSS)
wcss = []
for i in range(1, 11):
    mnkmeans = MiniBatchKMeans(n_clusters=i, batch_size=20, random_state=42)
    mnkmeans.fit(X)
    wcss.append(mnkmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()

# Time standard K-means on the selected features
kmeans = KMeans(n_clusters=5, random_state=42)
start = tm.default_timer()
kmeans.fit(X)
end = tm.default_timer()
print(end - start)

# Time Mini-batch K-means on the same features
mnkmeans = MiniBatchKMeans(n_clusters=5, batch_size=20, random_state=42)
start = tm.default_timer()
mnkmeans.fit(X)
end = tm.default_timer()
print(end - start)

# Train the model and plot the clusters
mnkmeans = MiniBatchKMeans(n_clusters=5, batch_size=20, random_state=42)
y_mnkmeans = mnkmeans.fit_predict(X)
plt.scatter(X[y_mnkmeans == 0, 0], X[y_mnkmeans == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_mnkmeans == 1, 0], X[y_mnkmeans == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_mnkmeans == 2, 0], X[y_mnkmeans == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_mnkmeans == 3, 0], X[y_mnkmeans == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_mnkmeans == 4, 0], X[y_mnkmeans == 4, 1], s=100, c='black', label='Cluster 5')
plt.scatter(mnkmeans.cluster_centers_[:, 0], mnkmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title("Clusters of Customers")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.show()
(iii.) make_moons
from sklearn.datasets import make_moons as mm

dataset, labels = mm(n_samples=500, noise=0.05)
color = ['orange', 'purple', 'green', 'cyan']
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[labels[i]], alpha=0.6)

# Standard K-means, timed for comparison
kmeans = KMeans(n_clusters=2, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')

# Mini-batch K-means
kmeans = MiniBatchKMeans(n_clusters=2, batch_size=20, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
(iv.) MNIST (sklearn digits)
from sklearn.datasets import load_digits
import numpy as np

data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def bench_k_means(kmeans, name, data, labels):
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]

    # Define the metrics which require only the true labels and estimator
    # labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]

    # The silhouette score requires the full dataset
    results += [
        metrics.silhouette_score(
            data,
            estimator[-1].labels_,
            metric="euclidean",
            sample_size=300,
        )
    ]

    # Show the results
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA

print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette")

kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)

kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)

pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(82 * "_")

import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
start = tm.default_timer()
kmeans.fit(reduced_data)
end = tm.default_timer()
print(end - start)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh. Use the last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title(
    "K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

# Mini-batch K-means
import matplotlib.pyplot as plt

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = MiniBatchKMeans(batch_size=200, n_clusters=n_digits, n_init=4)
start = tm.default_timer()
kmeans.fit(reduced_data)
end = tm.default_timer()
print(end - start)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh. Use the last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="w",
    zorder=10,
)
plt.title(
    "Mini-batch K-means clustering on the digits dataset (PCA-reduced data)\n"
    "Centroids are marked with white cross"
)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
3. Check hierarchical agglomerative clustering on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.3.
(i.) Mall Customers
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt

# Importing the dataset
df = pd.read_csv('Mall_Customers.csv')
#df.head()

# Feature selection
X = df.iloc[:, [3, 4]].values
#type(X)

import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, 'ward'))

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
#y_hc
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='magenta', label='Cluster 5')

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
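
The same five clusters can also be read directly off the SciPy linkage by cutting the tree, without going through AgglomerativeClustering; a minimal sketch, assuming X from above:

# Sketch: cut the ward linkage into 5 flat clusters with SciPy
# (assumes X from the Mall Customers section above).
from scipy.cluster.hierarchy import linkage, fcluster
Z = linkage(X, 'ward')
flat = fcluster(Z, t=5, criterion='maxclust')  # labels are 1..5
print(flat[:10])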
(ii.) make_blobs
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

X, y = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=0)
#print(X)
#print(y)

import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, 'ward'))

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
#y_hc
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='yellow', label='Cluster 3')
plt.title('Clusters of make_blobs')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend()
plt.show()
(iii.) make_moons
import numpy as np
from sklearn import cluster
import time
from itertools import cycle, islice
from sklearn.datasets import make_moons

noisy_moons = make_moons(n_samples=1500, noise=0.05)
plot_num = 1
default_base = {"n_clusters": 3}
datasets = [(noisy_moons, {"n_clusters": 2})]

from sklearn.preprocessing import StandardScaler

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # ============
    # Create cluster objects
    # ============
    ward = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="ward"
    )
    complete = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="complete"
    )
    average = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="average"
    )
    single = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="single"
    )

    clustering_algorithms = (
        ("Single Linkage", single),
        ("Average Linkage", average),
        ("Complete Linkage", complete),
        ("Ward Linkage", ward),
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the "
                + "connectivity matrix is [0-9]{1,2}"
                + " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning,
            )
            algorithm.fit(X)

        t1 = time.time()
        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        colors = np.array(
            list(
                islice(
                    cycle(
                        [
                            "#377eb8",
                            "#ff7f00",
                            "#4daf4a",
                            "#f781bf",
                            "#a65628",
                            "#984ea3",
                            "#999999",
                            "#e41a1c",
                            "#dede00",
                        ]
                    ),
                    int(max(y_pred) + 1),
                )
            )
        )
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plt.text(
            0.99,
            0.01,
            ("%.2fs" % (t1 - t0)).lstrip("0"),
            transform=plt.gca().transAxes,
            size=15,
            horizontalalignment="right",
        )
        plot_num += 1
plt.show()

(iv.) MNIST (sklearn digits)
from sklearn.datasets import load_digits
import numpy as np

digits = load_digits()
data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

X = digits.data

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
model = model.fit(X)

plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

from sklearn.cluster import AgglomerativeClustering

hc = AgglomerativeClustering(n_clusters=9, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(X)
#y_hc
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='cyan', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='magenta', label='Cluster 5')
plt.scatter(X[y_hc == 5, 0], X[y_hc == 5, 1], s=100, c='yellow', label='Cluster 6')
plt.scatter(X[y_hc == 6, 0], X[y_hc == 6, 1], s=100, c='orange', label='Cluster 7')
plt.scatter(X[y_hc == 7, 0], X[y_hc == 7, 1], s=100, c='purple', label='Cluster 8')
plt.scatter(X[y_hc == 8, 0], X[y_hc == 8, 1], s=100, c='black', label='Cluster 9')
plt.title('Clusters of load_digits')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend()
plt.show()
4. Check the DBSCAN algorithm on different datasets such as blobs, moons, MNIST, and Mall Customers.
Ans.4.
(i.) Mall Customers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

df = pd.read_csv('Mall_Customers.csv')
#df.head()

X = df.iloc[:, [3, 4]].values

# Use the nearest-neighbour distances to pick eps
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=2)
nbrs = nn.fit(X)

distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)  # sorting the distances
distances = distances[:, 1]  # taking the second column of the sorted distances
plt.rcParams['figure.figsize'] = (5, 3)  # setting the figure size
plt.plot(distances)  # plotting the distances
plt.show()  # showing the plot

dbscan = DBSCAN(eps=8, min_samples=4)
dbscan.fit(X)
labels = dbscan.labels_
#labels
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='plasma')
plt.xlabel('Income')
plt.ylabel('Spending score')
plt.show()
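
To summarize the fit numerically, the cluster and noise counts can be printed, along with a silhouette score over the non-noise points; a short sketch, assuming X and `labels` from the DBSCAN fit above:

# Sketch: summarize the DBSCAN result (assumes X and `labels` above).
from sklearn.metrics import silhouette_score

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print('Clusters:', n_clusters, ' Noise points:', n_noise)
mask = labels != -1  # silhouette is defined only for clustered points
if n_clusters > 1:
    print('Silhouette:', silhouette_score(X[mask], labels[mask]))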
(ii.) make_blobs
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)

X = StandardScaler().fit_transform(X)

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# Plot result
import matplotlib.pyplot as plt
%matplotlib inline

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

(iii.) make_moons
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics import v_measure_score

X, y = make_moons(n_samples=500, noise=0.1)
df = pd.DataFrame(X, columns=["X1", "X2"])
#df.head()

plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# DBSCAN with default parameters
dbscan_cluster1 = DBSCAN()
dbscan_cluster1.fit(X)

# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster1.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# Number of clusters
labels = dbscan_cluster1.labels_
N_clus = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)

# Identify noise
n_noise = list(dbscan_cluster1.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)

# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
# DBSCAN with a smaller eps
dbscan_cluster = DBSCAN(eps=0.1, min_samples=8)
dbscan_cluster.fit(X)

# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# Number of clusters
labels = dbscan_cluster.labels_
N_clus = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)

# Identify noise
n_noise = list(dbscan_cluster.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)

# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))

# Pick eps from the k-distance plot
from sklearn.neighbors import NearestNeighbors

nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(df)

distances, indices = neighbors.kneighbors(df)
distances = np.sort(distances[:, 10], axis=0)

fig = plt.figure(figsize=(5, 5))
plt.plot(distances)
plt.xlabel("Points")
plt.ylabel("Distance")

from kneed import KneeLocator  # pip install kneed

i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex',
                   direction='increasing', interp_method='polynomial')
fig = plt.figure(figsize=(5, 5))
knee.plot_knee()
plt.xlabel("Points")
plt.ylabel("Distance")

print(distances[knee.knee])

# DBSCAN with eps taken from the knee point
dbscan_cluster = DBSCAN(eps=0.163, min_samples=8)
dbscan_cluster.fit(X)

# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

# Number of clusters
labels = dbscan_cluster.labels_
N_clus = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)

# Identify noise
n_noise = list(dbscan_cluster.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)

# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
(iv.) MNIST (sklearn digits)
import sys
import matplotlib.pyplot as plt
from time import time
from sklearn.datasets import load_digits

digits = load_digits()
fig, axes = plt.subplots(2, 5, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for ax, img in zip(axes.ravel(), digits.images):
    ax.imshow(img)
# print(len(digits.data))  # 1797

colors = ["#476A2A", "#7851B8", '#BD3430', '#4A2D4E', '#875525',
          '#A83683', '#4E655E', '#853541', '#3A3120', '#535D8E']

t0 = time()
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca = pca.fit(digits.data)
digits_pca = pca.transform(digits.data)

plt.figure(figsize=(10, 10))
plt.xlim(digits_pca[:, 0].min(), digits_pca[:, 0].max())
plt.ylim(digits_pca[:, 1].min(), digits_pca[:, 1].max())

for i in range(len(digits.data)):
    plt.text(digits_pca[i, 0], digits_pca[i, 1],
             str(digits.target[i]),
             color=colors[digits.target[i]],
             fontdict={'weight': 'bold', 'size': 9})
plt.title('PCA')
plt.xlabel("first PC")
plt.ylabel("second PC")
print("PCA time: ", time() - t0)
plt.show()

# Feature scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler = scaler.fit(digits_pca)
scaled_p = scaler.transform(digits_pca)

# PCA -> DBSCAN
t2 = time()
from sklearn.cluster import DBSCAN
import numpy as np
db = DBSCAN(eps=0.122, min_samples=10).fit(scaled_p)
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("number of clusters in pca-DBSCAN: ", n_clusters_)

plt.scatter(scaled_p[:, 0], scaled_p[:, 1], c=labels, s=60, edgecolors='black')
plt.title('PCA -> DBSCAN')
plt.xlabel("first PC")
plt.ylabel("second PC")
print("DBSCAN time: ", time() - t2)
plt.show()
5. Check the voting classifier ensemble technique on different datasets such as Titanic and Iris.
Ans.5
(i.) Titanic
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

data = pd.read_csv('titanic.csv')
data.head()
data.info()

# Fill missing values
data['Parch'] = data['Parch'].fillna(0)
data['SibSp'] = data['SibSp'].fillna(0)
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Pclass'] = data['Pclass'].fillna(0)
data['Embarked'] = data['Embarked'].fillna('0')

# Encode categorical features
lblenc = LabelEncoder()
lblenc.fit(data['Sex'])
data['Sex'] = lblenc.transform(data['Sex'])
data['Embarked'] = data['Embarked'].replace('S', 1)
data['Embarked'] = data['Embarked'].replace('C', 2)
data['Embarked'] = data['Embarked'].replace('Q', 3)
data.info()

data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived', 'Embarked']]
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']]
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle=True)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Ensemble of models
estimator = []
estimator.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)))
estimator.append(('SVC', SVC(gamma='auto', probability=True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('NN', MLPClassifier()))

from sklearn.ensemble import VotingClassifier

# Voting classifier with hard voting
hard_voting = VotingClassifier(estimators=estimator, voting='hard')
hard_voting.fit(X_train, y_train)
y_pred = hard_voting.predict(X_test)

from sklearn.metrics import confusion_matrix
CM = confusion_matrix(y_test, y_pred)
print('Confusion Matrix is : \n', CM)

Output:
Confusion Matrix is :
 [[255  16]
 [ 41 120]]

from sklearn.metrics import accuracy_score
# accuracy_score metric to measure accuracy
score = accuracy_score(y_test, y_pred)
print("Hard Voting Score: ", score * 100)

# Voting classifier with soft voting
soft_voting = VotingClassifier(estimators=estimator, voting='soft')
soft_voting.fit(X_train, y_train)
y_pred = soft_voting.predict(X_test)
# Using accuracy_score
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score:", score * 100)
(ii.) Iris
from sklearn.datasets import load_iris

# Loading the iris dataset
iris = load_iris()
X = iris.data[:, :4]
y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Ensemble of models
estimator = []
estimator.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)))
estimator.append(('SVC', SVC(gamma='auto', probability=True)))
estimator.append(('DTC', DecisionTreeClassifier()))

from sklearn.ensemble import VotingClassifier

# Voting classifier with hard voting
hard_voting = VotingClassifier(estimators=estimator, voting='hard')
hard_voting.fit(X_train, y_train)
y_pred = hard_voting.predict(X_test)
print(y_pred)
print(y_test)

from sklearn.metrics import accuracy_score
# accuracy_score metric to measure accuracy
score = accuracy_score(y_test, y_pred)
print("Hard Voting Score: ", score)

# Voting classifier with soft voting
soft_voting = VotingClassifier(estimators=estimator, voting='soft')
soft_voting.fit(X_train, y_train)
y_pred = soft_voting.predict(X_test)

# Using accuracy_score
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score: ", score)
