ML Assignment 01 Code

This document performs principal component analysis (PCA) on the Iris dataset using Python and Scikit-learn. It loads and explores the Iris data, applies PCA to reduce the dimensionality, and analyzes the results, including the number of components retained, the explained variance, and the contribution of each original feature to the principal components.
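
All three code listings below apply the same selection rule: keep the smallest number of principal components whose cumulative explained variance reaches a chosen threshold (95% here), where the explained variance ratio of each component is its eigenvalue divided by the sum of all eigenvalues. A minimal sketch of that rule, using made-up eigenvalues rather than the Iris data:

import numpy as np

# Hypothetical (made-up) eigenvalues, already sorted in descending order
eigenvalues = np.array([2.9, 0.9, 0.15, 0.05])

# Explained variance ratio of each component and its running total
ratios = eigenvalues / eigenvalues.sum()
cumulative = np.cumsum(ratios)            # [0.725, 0.95, 0.9875, 1.0]

# Smallest k whose cumulative ratio reaches the 95% threshold
k = int(np.argmax(cumulative >= 0.95)) + 1
print(k)                                  # 2 for these example values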

Code 01

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA

# Load the Iris dataset
iris = datasets.load_iris()
data, columns = iris.data, iris.feature_names

# Create a Pandas DataFrame from the Iris dataset
df = pd.DataFrame(data, columns=columns)

# Step 1: Mean normalize the features
normalized_data = (df - df.mean()) / df.std()

# Step 2: Find the covariance matrix
covariance_matrix = np.cov(normalized_data, rowvar=False)

# Step 3: Find eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

# Step 4: Arrange eigenvalues in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# Step 5: Select eigenvalues that retain the required variance
total_variance = np.sum(sorted_eigenvalues)
variance_to_retain = 0.95
cumulative_variance = np.cumsum(sorted_eigenvalues) / total_variance
num_components_to_retain = np.argmax(cumulative_variance >= variance_to_retain) + 1

selected_eigenvalues = sorted_eigenvalues[:num_components_to_retain]
selected_eigenvectors = sorted_eigenvectors[:, :num_components_to_retain]

# Step 6: Transform original data using eigenvectors corresponding to selected eigenvalues
transformed_data = np.dot(normalized_data, selected_eigenvectors)

# Print the results
print(f"Number of components to retain {variance_to_retain * 100}% variance: {num_components_to_retain}")
print("Explained variance ratio:", selected_eigenvalues / total_variance)

# Interpret which features influenced the principal components the most
feature_contributions = np.abs(selected_eigenvectors) / np.sum(np.abs(selected_eigenvectors), axis=0)
feature_contributions_df = pd.DataFrame(feature_contributions, index=columns,
                                        columns=[f'PC{i + 1}' for i in range(num_components_to_retain)])
print("\nFeature contributions to Principal Components:")
print(feature_contributions_df)
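
As an optional sanity check (not part of the original assignment), the manual eigendecomposition above can be compared against Scikit-learn's PCA: the explained variance ratios should agree, while the projected coordinates may differ only by a per-component sign flip, since eigenvectors are defined only up to sign. A minimal sketch, assuming the variables from the script above (normalized_data, variance_to_retain, transformed_data, selected_eigenvalues, total_variance, num_components_to_retain) are still in scope; pca_check, sklearn_transformed, and signs are illustrative names:

from sklearn.decomposition import PCA

# Fit Scikit-learn's PCA with the same 95% variance threshold on the same data
# (the data is already mean-normalized, so sklearn's internal centering changes nothing)
pca_check = PCA(n_components=variance_to_retain)
sklearn_transformed = pca_check.fit_transform(normalized_data)

# Component count and explained variance ratios should match the manual result
print(pca_check.n_components_, num_components_to_retain)
print(pca_check.explained_variance_ratio_, selected_eigenvalues / total_variance)

# Projections should agree up to a sign flip of individual components
signs = np.sign(np.sum(transformed_data * sklearn_transformed, axis=0))
print(np.allclose(transformed_data * signs, sklearn_transformed, atol=1e-6))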

Code 02

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA


def load_iris_data():
    iris = datasets.load_iris()
    data, columns = iris.data, iris.feature_names
    return pd.DataFrame(data, columns=columns), iris.target


def display_correlation_matrix(df):
    correlation_matrix = df.corr()
    print("Correlation Matrix:")
    print(correlation_matrix)


def normalize_features(df):
    return (df - df.mean()) / df.std()


def perform_pca(data, target):
    pca = PCA(n_components=0.95)
    transformed_data = pca.fit_transform(data)
    plot_before_after_pca(data, transformed_data, target)
    display_pca_info(pca)
    display_feature_contributions(pca.components_, data.columns)


def plot_before_after_pca(original_data, transformed_data, target):
    plt.figure(figsize=(12, 6))

    # Original Data
    plt.subplot(1, 2, 1)
    plt.scatter(original_data.iloc[:, 0], original_data.iloc[:, 1], c=target, cmap='Set1')
    plt.title('Original Data')
    plt.xlabel('Feature 01')
    plt.ylabel('Feature 02')

    # Data after PCA
    plt.subplot(1, 2, 2)
    plt.scatter(transformed_data[:, 0], transformed_data[:, 1], c=target, cmap='Set1')
    plt.title('Data after PCA')
    plt.xlabel('Principal Component 01')
    plt.ylabel('Principal Component 02')

    plt.savefig('output_plot.png')
    plt.show()


def display_pca_info(pca):
    print(f"\nNumber of components to retain 95% variance: {pca.n_components_}")
    print("Explained variance ratio:", pca.explained_variance_ratio_)


def display_feature_contributions(components, columns):
    feature_contributions = np.abs(components) / np.sum(np.abs(components), axis=1)[:, np.newaxis]
    feature_contributions_df = pd.DataFrame(feature_contributions.T, index=columns,
                                            columns=[f'PC{i + 1}' for i in range(components.shape[0])])

    print("\nFeature contributions to Principal Components:")
    print(feature_contributions_df)

    # Plotting the feature contributions
    plt.figure(figsize=(12, 6))
    for i in range(components.shape[0]):
        plt.subplot(1, components.shape[0], i + 1)
        plt.bar(columns, feature_contributions_df.iloc[:, i])
        plt.title(f'PC{i + 1} Feature Contributions')
        plt.xlabel('Original Features')
        plt.ylabel('Contribution')
    plt.savefig('output_plot_feature_contributions.png')
    plt.show()


def main():
    iris_data, target = load_iris_data()
    display_correlation_matrix(iris_data)
    normalized_data = normalize_features(iris_data)
    perform_pca(normalized_data, target)


if __name__ == "__main__":
    main()
Code 03

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA

# Load the Iris dataset
iris = datasets.load_iris()
data, columns = iris.data, iris.feature_names

# Create a Pandas DataFrame from the Iris dataset
df = pd.DataFrame(data, columns=columns)

# Display the correlation matrix
correlation_matrix = df.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Mean normalize the features
normalized_data = (df - df.mean()) / df.std()

# Perform PCA using Scikit-learn, retaining 95% of the variance
pca = PCA(n_components=0.95)
transformed_data = pca.fit_transform(normalized_data)

# Plotting the data before and after PCA
plt.figure(figsize=(12, 6))

# Original Data
plt.subplot(1, 2, 1)
plt.scatter(normalized_data.iloc[:, 0], normalized_data.iloc[:, 1], c=iris.target, cmap='Set1')
plt.title('Original Data')
plt.xlabel('Feature 01')
plt.ylabel('Feature 02')

# Data after PCA
plt.subplot(1, 2, 2)
plt.scatter(transformed_data[:, 0], transformed_data[:, 1], c=iris.target, cmap='Set1')
plt.title('Data after PCA')
plt.xlabel('Principal Component 01')
plt.ylabel('Principal Component 02')

plt.savefig('output_plot.png')
plt.show()

# Display the number of components and explained variance ratio
print(f"\nNumber of components to retain 95% variance: {pca.n_components_}")
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Interpreting which features influenced the principal components the most
feature_contributions = np.abs(pca.components_) / np.sum(np.abs(pca.components_), axis=1)[:, np.newaxis]
feature_contributions_df = pd.DataFrame(feature_contributions.T, index=columns,
                                        columns=[f'PC{i + 1}' for i in range(pca.n_components_)])

print("\nFeature contributions to Principal Components:")
print(feature_contributions_df)

# Plotting the feature contributions
plt.figure(figsize=(12, 6))
for i in range(pca.n_components_):
    plt.subplot(1, pca.n_components_, i + 1)
    plt.bar(columns, feature_contributions_df.iloc[:, i])
    plt.title(f'PC{i + 1} Feature Contributions')
    plt.xlabel('Original Features')
    plt.ylabel('Contribution')

plt.savefig('output_plot_feature_contributions.png')
plt.show()
