ML2 Practical List
(i.)Mall customers
# Fetch the dataset
import pandas as pd
df = pd.read_csv('Mall_Customers.csv')
df.info()
# select features for training the model: Annual Income and Spending Score
X = df.iloc[:, [3, 4]].values
# find the optimum value of k in KMeans using the
# Within-Cluster Sum of Squares (WCSS)
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1,11),wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()
# Train the model
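The training step itself is missing from the extract; a minimal sketch, assuming k=5, a common elbow reading for these two features:

kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)  # k=5 is an assumed elbow reading
y_kmeans = kmeans.fit_predict(X)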
(ii.)make_blobs
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
X,y = make_blobs(random_state=1)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
print("cluster membership:\{}".format(kmeans.labels_))
kmeans.predict(X)
plt.scatter(X[kmeans==0,0], X[kmeans==0,1],s=100,
c='red', label='Cluster1')
plt.scatter(X[kmeans==1,0], X[kmeans==1,1],s=100,
c='blue', label='Cluster2')
plt.scatter(X[kmeans==2,0], X[kmeans==2,1],s=100,
c='green', label='Cluster3')
plt.scatter(kmeans.cluster_centers_[:,0],
kmeans.cluster_centers_[:,1],s=300, c='yellow',
label='Centroid')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.show()
print("Cluster memberships: \n{}".format(y_kmeans))
print(kmeans.cluster_centers_)
(iii.)make_moons
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
kmeans = KMeans(n_clusters=10,random_state=0)
kmeans.fit(X)
y_pred = kmeans.predict(X)
y_pred
plt.scatter(X[:,0],X[:,1],c=y_pred,s=60,cmap='Paired')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=60, marker='^', c=range(kmeans.n_clusters), linewidth=2, cmap='Paired')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
print("Cluster memberships: \n{}".format(y_pred))
(iv.)MNIST
import numpy as np
from sklearn.datasets import load_digits
data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")
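bench_k_means is called below but never defined in this extract; a sketch of the helper, following the scikit-learn "K-Means on handwritten digits" demo that this practical appears to be based on:

from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def bench_k_means(kmeans, name, data, labels):
    # fit the estimator on standardized data and print timing and metrics
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]
    # supervised metrics require the true labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ]
    results += [m(labels, estimator[-1].labels_) for m in clustering_metrics]
    # the silhouette score needs the data itself
    results += [
        metrics.silhouette_score(data, estimator[-1].labels_,
                                 metric="euclidean", sample_size=300)
    ]
    formatter_result = (
        "{:9s}\t{:.3f}s\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    )
    print(formatter_result.format(*results))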
print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\
tAMI\tsilhouette")
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)
from sklearn.decomposition import PCA
pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)
print(82 * "_")
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)
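The extract stops after fitting on the 2-D PCA projection; a minimal visualization sketch (this plotting step is not in the source):

import matplotlib.pyplot as plt

plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_, s=10, cmap='Paired')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=150, marker='x', c='black')
plt.title("K-Means on PCA-reduced digits")
plt.show()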
(i.)make_blobs
from sklearn.datasets import make_blobs as mb
from sklearn.cluster import KMeans, MiniBatchKMeans
import matplotlib.pyplot as plt
import timeit as tm
import warnings
warnings.filterwarnings('ignore')

centre = [[0, 0], [1000, 0], [1000, 1000], [0, 1000]]
dataset, labels = mb(n_samples=1000, centers=centre, cluster_std=200)
color = ['orange', 'purple', 'green', 'cyan']

# plot the generated blobs with their true labels
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[labels[i]], alpha=0.6)

# fit MiniBatchKMeans and time it (the fitting step is missing in the
# source; reconstructed to match the make_moons run later in this practical)
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=20, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)

# plot the predicted labels and the fitted centroids
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
(ii.)Mall customers
# Fetch the dataset
import pandas as pd
df = pd.read_csv('Mall_Customers.csv')
df.head()
# select the same two features and find the optimum k with the elbow method
X = df.iloc[:, [3, 4]].values
wcss = []
for i in range(1, 11):
    mnkmeans = MiniBatchKMeans(n_clusters=i, batch_size=20, random_state=42)
    mnkmeans.fit(X)
    wcss.append(mnkmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title("The Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()
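The centroid overlay below assumes a fitted model and per-cluster scatter that the extract omits; a minimal sketch, assuming k=5 from the elbow plot above:

kmeans = MiniBatchKMeans(n_clusters=5, batch_size=20, random_state=42)  # k=5 is an assumed elbow reading
y_mb = kmeans.fit_predict(X)
for c, colr in enumerate(['red', 'blue', 'green', 'cyan', 'magenta']):
    plt.scatter(X[y_mb == c, 0], X[y_mb == c, 1], s=100, c=colr, label='Cluster' + str(c + 1))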
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroid')
plt.title("Clusters of Customers")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.legend()
plt.show()
(iii.)make_moons
from sklearn.datasets import make_moons as mm

dataset, labels = mm(n_samples=500, noise=0.05)
color = ['orange', 'purple', 'green', 'cyan']
# plot the moons with their true labels
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[labels[i]], alpha=0.6)
kmeans = KMeans(n_clusters=2, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end-start)
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
# Mini Batch Kmeans
kmeans = MiniBatchKMeans(n_clusters=2, batch_size=20, random_state=0)
start = tm.default_timer()
kmeans.fit(dataset)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end-start)
for i in range(len(dataset)):
    plt.scatter(dataset[i][0], dataset[i][1], color=color[a[i]], alpha=0.3)
for i in range(len(centers)):
    plt.scatter(centers[i][0], centers[i][1], color='black')
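A hedged comparison of the two partitions with silhouette_score (not in the source) makes the speed/quality trade-off concrete:

from sklearn.metrics import silhouette_score

km_labels = KMeans(n_clusters=2, random_state=0).fit_predict(dataset)
print("KMeans silhouette:         ", silhouette_score(dataset, km_labels))
print("MiniBatchKMeans silhouette:", silhouette_score(dataset, kmeans.labels_))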
(iv.)MNIST
from sklearn.datasets import load_digits
import numpy as np
data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
def bench_k_means(kmeans, name, data, labels):
    # minimal wrapper around the fragments shown in the source: fit on
    # standardized data and report name, fit time and inertia (see the
    # fuller sklearn-demo version in the first MNIST practical)
    t0 = time()
    estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
    fit_time = time() - t0
    results = [name, fit_time, estimator[-1].inertia_]
    print("{:9s}\t{:.3f}s\t{:.0f}".format(*results))
print(82 * "_")
print("init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\
tAMI\tsilhouette")
kmeans = KMeans(init="k-means++",
n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++",
data=data, labels=labels)
print(82 * "_")
from sklearn.decomposition import PCA

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
start = tm.default_timer()
kmeans.fit(reduced_data)
end = tm.default_timer()
a = kmeans.labels_
centers = kmeans.cluster_centers_
print(end - start)
(ii.)make_blobs
from sklearn.datasets import make_blobs as mb
import matplotlib.pyplot as plt
import timeit as tm
import warnings
warnings.filterwarnings('ignore')
centre = [[0, 0], [1000, 0], [1000, 1000], [0, 1000]]
dataset, labels = mb(n_samples=1000, centers=centre, cluster_std=200)
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=1000, centers=3, n_features=2, random_state=0)
#print(X)
#print(y)
import scipy.cluster.hierarchy as sch
dendrogram= sch.dendrogram(sch.linkage(X, 'ward'))
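The dendrogram only draws the merge tree; scipy's fcluster can cut the same linkage into flat cluster labels (a hedged sketch, not in the source):

import numpy as np
from scipy.cluster.hierarchy import fcluster

Z = sch.linkage(X, 'ward')
flat_labels = fcluster(Z, t=3, criterion='maxclust')  # cut into 3 flat clusters
print(np.bincount(flat_labels)[1:])  # cluster sizes (labels start at 1)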
X, y = dataset, labels  # cluster the four corner blobs generated above
# ============
# Create cluster objects
# ============
from sklearn import cluster

params = {"n_clusters": 4}  # the dataset above was generated from four centres
ward = cluster.AgglomerativeClustering(n_clusters=params["n_clusters"], linkage="ward")
complete = cluster.AgglomerativeClustering(n_clusters=params["n_clusters"], linkage="complete")
average = cluster.AgglomerativeClustering(n_clusters=params["n_clusters"], linkage="average")
single = cluster.AgglomerativeClustering(n_clusters=params["n_clusters"], linkage="single")
clustering_algorithms = (
("Single Linkage", single),
("Average Linkage", average),
("Complete Linkage", complete),
("Ward Linkage", ward),
)
import time
import numpy as np
from itertools import cycle, islice
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)  # normalize so all panels share one scale
plot_num = 1
plt.figure(figsize=(12, 3))
for name, algorithm in clustering_algorithms:
    t0 = time.time()
    algorithm.fit(X)
    t1 = time.time()
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)
    plt.subplot(1, len(clustering_algorithms), plot_num)
    plt.title(name, size=18)
    colors = np.array(
        list(
            islice(
                cycle([
                    "#377eb8", "#ff7f00", "#4daf4a", "#f781bf", "#a65628",
                    "#984ea3", "#999999", "#e41a1c", "#dede00",
                ]),
                int(max(y_pred) + 1),
            )
        )
    )
    plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
    plt.xlim(-2.5, 2.5)
    plt.ylim(-2.5, 2.5)
    plt.xticks(())
    plt.yticks(())
    plt.text(
        0.99,
        0.01,
        ("%.2fs" % (t1 - t0)).lstrip("0"),
        transform=plt.gca().transAxes,
        size=15,
        horizontalalignment="right",
    )
    plot_num += 1
plt.show()
(iv.)MNIST
from sklearn.datasets import load_digits
import numpy as np
digits = load_digits()
data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size
print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
from sklearn.cluster import AgglomerativeClustering

# distance_threshold=0 with n_clusters=None builds the full merge tree,
# which the dendrogram plot below requires
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
X = digits.data
model = model.fit(X)
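plot_dendrogram is called below but not defined in this extract; a sketch following the scikit-learn hierarchical-clustering dendrogram demo, which this practical appears to use:

import numpy as np
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # count the samples under each node of the merge tree
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # build the scipy-style linkage matrix from the fitted model
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)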
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if
no parenthesis).")
plt.show()
(ii.)Mall customers
import pandas as pd
import numpy as np
df = pd.read_csv('Mall_Customers.csv')
X = df.iloc[:, [3, 4]].values
from sklearn.neighbors import NearestNeighbors
nn= NearestNeighbors(n_neighbors=2)
nbrs=nn.fit(X)
distances, indices=nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)  # sort the distances
distances = distances[:, 1]  # take the second column of the sorted distances
plt.rcParams['figure.figsize'] = (5, 3)  # set the figure size
plt.plot(distances)  # plot the k-distance curve
plt.show()  # show the plot
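eps is usually read off this k-distance plot by eye; a hedged alternative using the third-party kneed package (it also appears later in this practical) to locate the elbow programmatically:

from kneed import KneeLocator  # third-party: pip install kneed

i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing')
print("suggested eps:", distances[knee.knee])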
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Plot result
import matplotlib.pyplot as plt
%matplotlib inline
# plot core points large and non-core points small, cluster by cluster
# (noise carries the label -1)
for k in set(labels):
    class_member_mask = (labels == k)
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markersize=8)
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markersize=3)
plt.show()
(iii.)make_moons
import pandas as pd
import numpy as np
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics import v_measure_score
X, y = make_moons(n_samples=500, noise=0.1)
df = pd.DataFrame(X)
df = df.rename(columns={0: "X1", 1: "X2"})
#df.head()
dbscan_cluster1 = DBSCAN()
dbscan_cluster1.fit(X)
# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster1.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
# Number of Clusters
labels=dbscan_cluster1.labels_
N_clus=len(set(labels))-(1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)
# Identify Noise
n_noise = list(dbscan_cluster1.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)
# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
dbscan_cluster = DBSCAN(eps=0.1, min_samples=8)
dbscan_cluster.fit(X)
# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
# Number of Clusters
labels=dbscan_cluster.labels_
N_clus=len(set(labels))-(1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)
# Identify Noise
n_noise = list(dbscan_cluster.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)
# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
from kneed import KneeLocator  # third-party: pip install kneed
from sklearn.neighbors import NearestNeighbors

nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(df)
# the kneighbors/sort step is missing in the source; reconstructed from the earlier k-distance block
distances, indices = neighbors.kneighbors(df)
distances = np.sort(distances[:, 10], axis=0)
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')
fig = plt.figure(figsize=(5, 5))
knee.plot_knee()
plt.xlabel("Points")
plt.ylabel("Distance")
print(distances[knee.knee])
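The visualization that follows reuses dbscan_cluster, but refitting with the knee-derived eps is the evident intent of this step; a hedged sketch:

dbscan_cluster = DBSCAN(eps=distances[knee.knee], min_samples=8)
dbscan_cluster.fit(X)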
# Visualizing DBSCAN
plt.scatter(X[:, 0], X[:, 1], c=dbscan_cluster.labels_)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
# Number of Clusters
labels=dbscan_cluster.labels_
N_clus=len(set(labels))-(1 if -1 in labels else 0)
print('Estimated no. of clusters: %d' % N_clus)
# Identify Noise
n_noise = list(dbscan_cluster.labels_).count(-1)
print('Estimated no. of noise points: %d' % n_noise)
# Calculating v_measure
print('v_measure =', v_measure_score(y, labels))
(iv.)MNIST
import sys
import matplotlib.pyplot as plt
from time import time
from sklearn.datasets import load_digits
digits = load_digits()
fig, axes = plt.subplots(2, 5, figsize=(10, 5), subplot_kw={'xticks': (), 'yticks': ()})
for ax, img in zip(axes.ravel(), digits.images):
    ax.imshow(img)
# print(len(digits.data))  # 1797
colors = ["#476A2A", "#7851B8", "#BD3430", "#4A2D4E", "#875525",
          "#A83683", "#4E655E", "#853541", "#3A3120", "#535D8E"]
t0=time()
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca = pca.fit(digits.data)
digits_pca = pca.transform(digits.data)
plt.figure(figsize=(10,10))
plt.xlim(digits_pca[:,0].min(), digits_pca[:,0].max())
plt.ylim(digits_pca[:,1].min(), digits_pca[:,1].max())
for i in range(len(digits.data)):
    plt.text(digits_pca[i, 0], digits_pca[i, 1], str(digits.target[i]),
             color=colors[digits.target[i]], fontdict={'weight': 'bold', 'size': 9})
plt.title('PCA')
plt.xlabel("first PC")
plt.ylabel("second PC")
print ("PCA time: ", time()-t0)
plt.show()
# feature scaling: `scaled_p` is used below but never defined in the
# source; scaling the 2-D PCA projection is the evident intent
from sklearn.preprocessing import StandardScaler
scaled_p = StandardScaler().fit_transform(digits_pca)

t2 = time()
from sklearn.cluster import DBSCAN
import numpy as np
db = DBSCAN(eps=0.122, min_samples=10).fit(scaled_p)
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("number of clusters in pca-DBSCAN: ", n_clusters_)
print("pca-DBSCAN time: ", time() - t2)
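The voting-ensemble practical below uses `data` before loading it; a minimal loading sketch, assuming the Kaggle Titanic train.csv (the filename is an assumption):

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

data = pd.read_csv('train.csv')  # assumed: Kaggle Titanic training file
data.head()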
data['Embarked'] = data['Embarked'].fillna('0')
data['Parch'] = data['Parch'].fillna(0)
data['SibSp'] = data['SibSp'].fillna(0)
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Sex'] = data['Sex'].fillna(0)
data['Pclass'] = data['Pclass'].fillna(0)
lblenc=LabelEncoder()
lblenc.fit(data['Sex'])
data['Sex']=lblenc.transform(data['Sex'])
data['Embarked']=data['Embarked'].replace('S',1)
data['Embarked']=data['Embarked'].replace('C',2)
data['Embarked']=data['Embarked'].replace('Q',3)
data.info()
#df.head()
data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived', 'Embarked']]
X=data[['Pclass','Sex','Age','SibSp','Parch','Embarked']]
y=data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=44, shuffle=True)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
# Ensemble of Models
estimator = []
estimator.append(('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)))
estimator.append(('SVC', SVC(gamma='auto', probability=True)))
estimator.append(('DTC', DecisionTreeClassifier()))
estimator.append(('NN', MLPClassifier()))
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Soft voting: average the predicted class probabilities of the base models
vot_soft = VotingClassifier(estimators=estimator, voting='soft')
vot_soft.fit(X_train, y_train)
y_pred = vot_soft.predict(X_test)
# Using accuracy_score
score = accuracy_score(y_test, y_pred)
print("Soft Voting Score: ", score)