PCA Code-Checkpoint
PCA Code-Checkpoint
In [1]:
import numpy as np
import pandas as pd
In [2]:
# Toy data set: 5 samples (rows) x 4 features (columns).
# A plain ndarray is used instead of np.matrix: NumPy no longer recommends the
# matrix class, and it forces every indexing result to stay 2-D.
A = np.array([
    [1, 2, 3, 4],
    [5, 5, 6, 7],
    [1, 4, 2, 3],
    [5, 3, 2, 1],
    [8, 1, 2, 2],
])
In [3]:
# Wrap the raw values in a DataFrame with one named column per feature.
df = pd.DataFrame(
    data=A,
    columns=['f1', 'f2', 'f3', 'f4'],
)
df
Out[3]:
f1 f2 f3 f4
0 1 2 3 4
1 5 5 6 7
2 1 4 2 3
3 5 3 2 1
4 8 1 2 2
In [4]:
# Standardize every feature: subtract the column mean, then divide by the
# column standard deviation (pandas' default std() is the sample form, ddof=1).
df_std = df.sub(df.mean()).div(df.std())
df_std
Out[4]:
f1 f2 f3 f4
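Sample formula: $\operatorname{cov}(x, y) = \frac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})$ (divide by $n-1$)
Population formula: $\operatorname{cov}(x, y) = \frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})$ (divide by $n$)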
In [5]:
# Population covariance of the standardized features (normalised by N).
# The transpose puts one feature per row, the layout np.cov treats as variables;
# ddof=0 is the same normalisation as bias=1.
df_cov = np.cov(df_std.T, ddof=0)
df_cov
Out[5]:
array([[ 0.8 , -0.25298221, 0.03849002, -0.14479075],
[-0.25298221, 0.8 , 0.51120772, 0.49449803],
[ 0.03849002, 0.51120772, 0.8 , 0.75235479],
[-0.14479075, 0.49449803, 0.75235479, 0.8 ]])
In [6]:
# Sample covariance of the standardized features (normalised by N - 1).
# The transpose puts one feature per row, the layout np.cov treats as variables;
# ddof=1 is the same normalisation as bias=0.
cov_mat = np.cov(df_std.T, ddof=1)
cov_mat
Out[6]:
array([[ 1. , -0.31622777, 0.04811252, -0.18098843],
[-0.31622777, 1. , 0.63900965, 0.61812254],
[ 0.04811252, 0.63900965, 1. , 0.94044349],
[-0.18098843, 0.61812254, 0.94044349, 1. ]])
In [9]:
## verify variance(f1) is as expected
# df_std is mean-centred, so the variance of f1 is its sum of squares divided
# by the sample count (population) or by the sample count minus one (sample).
# The counts come from the data instead of being hard-coded as 5 and 4.
n_samples = len(df_std)
f1_sum_of_squares = (df_std.f1 ** 2).sum()
print('var(f1) (population formula): ', f1_sum_of_squares / n_samples)
print('var(f1) (sample formula): ', f1_sum_of_squares / (n_samples - 1))
In [11]:
## verify covariance(f1,f2) is as expected
# df_std is mean-centred, so the covariance is the sum of element-wise products
# divided by the sample count (population) or by the count minus one (sample).
# The counts come from the data instead of being hard-coded as 5 and 4.
n_samples = len(df_std)
f1_f2_cross_sum = (df_std.f1 * df_std.f2).sum()
print('covar(f1,f2) (population formula): ', f1_f2_cross_sum / n_samples)
print('covar(f1,f2) (sample formula): ', f1_f2_cross_sum / (n_samples - 1))
In [13]:
print(eigen_val)
In [14]:
print(eigen_vectors)
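The eigenvalues are already sorted (largest first) in our case, so an explicit sorting step is not needed here. In general the eigenvalues are not guaranteed to come back in order, so sort them before selecting components.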
In [75]:
# Number of principal components to keep.
n_components = 3
# pick the leading `n_components` eigenvectors in the following step
# (the bare word "pick" was not valid code and would raise NameError if run)
In [76]:
top_eigen_vectors = eigen_vectors[:,:n_components]
In [77]:
top_eigen_vectors
Out[77]:
array([[ 0.16195986, -0.91705888, -0.30707099],
[-0.52404813, 0.20692161, -0.81731886],
[-0.58589647, -0.3205394 , 0.1882497 ],
[-0.59654663, -0.11593512, 0.44973251]])
In [78]:
top_eigen_vectors.shape
Out[78]:
(4, 3)
In [79]:
np.array(df_std).shape
Out[79]:
(5, 4)
In [80]:
transformed_data = np.matmul(np.array(df_std),top_eigen_vectors)
In [85]:
# Label the projected data, one column per principal component, and bind it to
# `principalDf` -- the name the final cell displays. Without the assignment that
# name is undefined on a fresh kernel.
principalDf = pd.DataFrame(
    data=transformed_data,
    columns=['principal component ' + str(i + 1) for i in range(n_components)],
)
principalDf
Out[85]:
In [82]:
transformed_data.shape
Out[82]:
(5, 3)
In [84]:
principalDf
Out[84]:
In [ ]:
In [ ]: