Data Science Practical Book - Ipynb
T.Y.B.Sc(Computer Science)
CS 358 : Data Science Practicals
1.
2.
Size = 30
Shape = (10, 3)
Number of rows = 10
Number of Columns = 3
Feature Names = Name, Age, Percentage
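The code cell that produced the above output is not included in this excerpt; a minimal sketch, assuming a DataFrame named data2 with the columns Name, Age and Percentage (the values below are placeholders):
import pandas as pd
# placeholder data; the original dataset is not shown in this excerpt
data2 = pd.DataFrame({'Name': ['A','B','C','D','E','F','G','H','I','J'],
                      'Age': [20,21,20,22,21,20,23,22,21,20],
                      'Percentage': [75.5,62.0,88.2,70.1,55.4,91.0,68.3,73.9,80.6,66.7]})
print("Size =", data2.size)                  # total cells = rows * columns
print("Shape =", data2.shape)                # (rows, columns)
print("Number of rows =", data2.shape[0])
print("Number of Columns =", data2.shape[1])
print("Feature Names =", ", ".join(data2.columns))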
3.
4.
17
17
False 14
True 1
dtype: int64
5.
6.
Scatterplot
data2.plot.scatter(x='Name',y='Percentage',
title = "Scatterplot")
plt.show()
SET B
1.
import pandas as pd
data=pd.read_csv('SOCR-HeightWeight.csv')
data.tail(10) #print last 10 rows
data.sample(20) #print 20 random rows
data.head(10) #print first 10 rows
index Height(Inches) Weight(Pounds)
0 65.78331 112.9925
1 71.51521 136.4873
2 69.39874 153.0269
3 68.2166 142.3354
4 67.78781 144.2971
5 68.69784 123.3024
6 69.80204 141.4947
7 70.01472 136.4623
8 67.90265 112.3723
9 66.78236 120.6672
2.
3.
1.
import numpy as np
a=np.array((2,3))
b=np.array((4,5))
#Euclidean Distance
dist = np.linalg.norm(a - b)
print("Euclidean Distance = ", dist)
2.
Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Name': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
             'Scores': [56, 62, 42, 74, 32, 63, 74, 84, 96, 21]}
#Create dataframe from the dictionary of values
data=pd.DataFrame.from_dict(data_values)
print(data) #To view the data frame
print("\n Mean Score = ",s.tmean(data["Scores"]) )
print("\n Maximum = ",max(data["Scores"]))
print("\n Minimum = ",min(data["Scores"]))
print("\n Range = ",
max(data["Scores"]) - min(data["Scores"]) )
q3,q1 = np.percentile(data["Scores"],[75,25])
print("\n Q3 = ", q3)
print("\n Q1 = ", q1)
print("\n IQR = ", q3 - q1)
Name Scores
0 A 56
1 B 62
2 C 42
3 D 74
4 E 32
5 F 63
6 G 74
7 H 84
8 I 96
9 J 21
Mean Score = 60.4

Maximum = 96
Minimum = 21
Range = 75
Q3 = 74.0
Q1 = 45.5
IQR = 28.5
3.
a=[3,5,5,6,5,4,3]
b=[-2,3,2,-5,2,3,-1]
manhattan = sum(abs(x - y) for x, y in zip(a, b))   #Manhattan Distance
print("Manhattan Distance = ", manhattan)
Manhattan Distance = 29
SET B
1.
import pandas as pd
data=pd.read_csv('iris.csv')
print("Number of records for different variety/class attribute \n")
data['variety'].value_counts()
Number of records for different variety/class attribute
Versicolor 50
Setosa 50
Virginica 50
Name: variety, dtype: int64
2.
import pandas as pd
from pandas.api.types import is_numeric_dtype
print("Iris Dataset : Column wise Mean and Median \n")
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Mean = %.2f' % data[col].mean())
        print('\t Median = %.2f' % data[col].median())
sepal.length:
Mean = 5.84
Median = 5.80
sepal.width:
Mean = 3.06
Median = 3.00
petal.length:
Mean = 3.76
Median = 4.35
petal.width:
Mean = 1.20
Median = 1.30
SET C :
1.
from scipy.spatial.distance import minkowski
a=[-1,5]
b=[2,4]
n=len(a)   #order (p value) of the Minkowski metric; p=2 gives the Euclidean distance
print("\n Minkowski Distance = ",minkowski(a,b,n))
Minkowski Distance = 3.162
2.
import numpy as np
x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
print("\nOriginal array1:")
print(x)
print("\nOriginal array1:")
print(y)
print("\nCross-correlation of the said arrays:\n",np.cov(x, y))
Original array1:
[0 1 3]

Original array2:
[2 4 5]

Cross-correlation of the said arrays:
 [[2.33333333 2.16666667]
 [2.16666667 2.33333333]]
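Note that np.cov returns the covariance matrix rather than a normalized cross-correlation; if correlation coefficients are wanted instead, np.corrcoef can be used, for example:
import numpy as np
x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
# np.corrcoef returns the Pearson correlation matrix of x and y
print("Correlation coefficients:\n", np.corrcoef(x, y))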
3.
Subject 1:
Arithmetic Mean = 53.50
Geometric Mean = 46.35
Harmonic Mean = 38.71
Subject 2:
Arithmetic Mean = 60.40
Geometric Mean = 55.41
Harmonic Mean = 49.53
Subject 3:
Arithmetic Mean = 20.30
Geometric Mean = 19.93
Harmonic Mean = 19.58
Subject 4:
Arithmetic Mean = 57.50
Geometric Mean = 49.59
Harmonic Mean = 39.96
Subject 5:
Arithmetic Mean = 54.50
Geometric Mean = 49.33
Harmonic Mean = 44.27
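The cell that produced these means is not shown; a minimal sketch, assuming the marks are stored column-wise in a DataFrame (the column names and values here are placeholders):
import numpy as np
import pandas as pd
from scipy.stats import gmean, hmean
# placeholder marks; the original dataset is not included in this excerpt
marks = pd.DataFrame({'Subject 1': [35, 72, 40, 67],
                      'Subject 2': [55, 66, 48, 72]})
for subject in marks.columns:
    values = marks[subject]
    print(subject, ":")
    print("  Arithmetic Mean = %.2f" % np.mean(values))
    print("  Geometric Mean = %.2f" % gmean(values))
    print("  Harmonic Mean = %.2f" % hmean(values))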
ASSIGNMENT 3 : DATA PREPROCESSING
SET A
1.
import pandas as pd
import io
data = pd.read_csv('Data.csv',sep = ',')
data
index Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
a.
data.describe()
index Age Salary
count 9.0 9.0
mean 38.77777777777778 63777.77777777778
std 7.693792591722527 12265.579661982732
min 27.0 48000.0
25% 35.0 54000.0
50% 38.0 61000.0
75% 44.0 72000.0
max 50.0 83000.0
b.
Size = 40
Shape of DataFrame Object = (10, 4)
Number of rows = 10
Number of Columns = 4
c.
2. a.
index 0 1 2 3
0 1.0 0.0 0.0 0.0
1 0.0 0.0 0.0 1.0
2 0.0 0.0 1.0 0.0
3 0.0 0.0 0.0 1.0
4 0.0 0.0 1.0 0.0
5 1.0 0.0 0.0 0.0
6 0.0 0.0 0.0 1.0
7 0.0 1.0 0.0 0.0
8 0.0 0.0 1.0 0.0
9 1.0 0.0 0.0 0.0
data_merge= data.join(enc_data)
data_merge
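The enc_data frame joined above is not defined in this excerpt; a minimal sketch of how it could have been created, assuming scikit-learn's OneHotEncoder applied to the Country column of the data frame loaded above (the column actually encoded in the original may differ):
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
enc = OneHotEncoder()
# fit_transform returns a sparse matrix by default; toarray() makes it dense
enc_data = pd.DataFrame(enc.fit_transform(data[['Country']]).toarray())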
b.
Applying label encoding on the Purchased column
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
data['Purchased'] = labelencoder.fit_transform(data['Purchased'])
data
SET B
1.
# Rescaling Data
import pandas, scipy, numpy
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
array=data.values   #convert the dataframe to a NumPy array of values
data_scaler=preprocessing.MinMaxScaler(feature_range=(0,1))
data_scaled = data_scaler.fit_transform(array)
print("\n Min Max Scaled Data \n \n ")
print(data_scaled.round(3))
2.
# Standardizing Data
from sklearn.preprocessing import StandardScaler
import scipy.stats as s
scaler=StandardScaler().fit(data)
std_data=scaler.transform(data)
print("\n Standardized Data \n ")
print(std_data)
print("\n Standardized Mean : ",s.tmean(std_data).round(2))
print(" Standardized Standard Deviation : ",round(std_data.std(),2))
Standardized Data
[[-0.528 ... ]
 ...
 [ ... 0.45084835]]
3.
# Normalizing Data
import numpy as np
import pandas as pd
import scipy.stats as s
from sklearn import preprocessing
norm_data=preprocessing.normalize(data,norm='l1')
print("\n Normalized Data \n ")
norm_data
Normalized Data
array([[0.099 ... ],
       ...
       [ ... 0.06487013]])
4.
# Binarizing Data
binarized_data=preprocessing.Binarizer(threshold=0.0).fit(data).transform(data)
print("\n Binarized Data \n ")
binarized_data
Binarized Data
SET C
1.
import pandas as pd
import io
data= pd.read_csv('Student_bucketing.csv')
data=pd.DataFrame(data)
data['bucket']=pd.cut(data['marks'],5,
        labels=['Poor','Below_average','Average','Above_average','Excellent'])
data.head(10)
1.
2.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
sns.countplot(x='variety',data = data)
plt.title("Iris Species Count")
plt.show()
3.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
fig, ax = plt.subplots(1,1,figsize=(10,8))
data['variety'].value_counts().plot.pie(explode=[0.1,0.1,0.1],
        autopct='%1.1f%%',shadow=True,figsize=(10,8))
plt.title("Iris Species %")
plt.show()
4.
sns.FacetGrid(data,hue="variety").map(sns.histplot,"petal.length").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"petal.width").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"sepal.length").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"sepal.width").add_legend()
plt.show()
SET B
1.
def graph(a):
    sns.boxplot(x="variety", y=a, data=data)
plt.figure(figsize=(10,10))
plt.subplot(221)
graph('sepal.length')
plt.subplot(222)
graph('sepal.width')
plt.subplot(223)
graph('petal.length')
plt.subplot(224)
graph('petal.width')
plt.show()
SET C
1.
2.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
g = sns.jointplot(x="sepal.length", y="sepal.width", shade=True, data=data, kind="kde", color="b")
g.plot_joint(plt.scatter, c="gold", s=40, linewidth=1, marker="*")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$SepalLength$", "$SepalWidth$")
plt.show()