Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
14 views

Data Science Practical Book - Ipynb

Uploaded by

Tejas Tadka
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
14 views

Data Science Practical Book - Ipynb

Uploaded by

Tejas Tadka
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 21

Data Science Practical Notebook

T.Y.B.Sc(Computer Science)
CS 358 : Data Science Practicals

 Assignment 1 : The Data Science Environment

 Assignment 2 : Statistical Data Analysis

 Assignment 3 : Data Preprocessing

 Assignment 4 : Data Visualization


ASSIGNMENT 1 : THE DATA SCIENCE ENVIRONMENT
SET A

1.

Create and view a data frame


#import the library
import pandas as pd
import numpy as np
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'
],
'Age' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Percentage' : [56,62,42,74,32,63,74,84,96,21]
}
#Create the dataframe from the dictionary of column values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame

index Name Age Percentage


0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21

2.

#print shape >> number of rows - columns


print("Size={}\n Shape={}\nNumber of rows={}\nNumber of Columns={}".
format(data.size, data.shape, data.shape[0], data.shape[1]))
print("\n Feature Names = {}, {}, {}".
format(data.columns[0], data.columns[1], data.columns[2]))

Size = 30
Shape = (10, 3)
Number of rows = 10
Number of Columns = 3
Feature Names = Name, Age, Percentage
3.

Adding 5 rows and 1 column


data.loc[10] = ['K',21,56 ]
data.loc[11] = ['L',21,None]
data.loc[12] = ['M',None, 45]
data.loc[13] = ['K',21,56]
data.loc[14] = ['O',25,84]
data["Remarks"] = None
data
index Name Age Percentage Remarks
0 A 26 56 null
1 B 28 62 null
2 C 20 42 null
3 D 15 74 null
4 E 20 32 null
5 F 16 63 null
6 G 18 74 null
7 H 17 84 null
8 I 22 96 null
9 J 21 21 null
10 K 21 56 null
11 L 21 null null
12 M null 45 null
13 K 21 56 null
14 O 25 84 null

4.

# Report the number of rows, the total count of missing cells, and how many
# rows are duplicates of an earlier row.
print("Number of Observations = ", len(data.index))

print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
# duplicated() flags repeated rows; value_counts() tallies True vs. False.
print(data.duplicated().value_counts())
Number of Observations = 15

Total missing values in a DataFrame :

17

Total missing values in a DataFrame :

17
False 14
True 1
dtype: int64
5.

Removing a column and missing values


data2=data.drop(columns='Remarks')
data2=data2.dropna(axis=0)
#print modified data
data2
index Name Age Percentage
0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21
10 K 21 56
13 K 21 56
14 O 25 84

6.

Scatterplot
# pyplot must be in scope for plt.show(); no earlier cell imports it.
import matplotlib.pyplot as plt

# Scatter plot of Percentage against Name for the cleaned frame.
data2.plot.scatter(x='Name', y='Percentage', title="Scatterplot")
plt.show()

SET B

1.
import pandas as pd
data=pd.read_csv('SOCR-HeightWeight.csv')
data.tail(10) #print last 10 rows
data.sample(20) #print 20 random rows
data.head(10) #print first 10 rows
index Height(Inches) Weight(Pounds)
0 65.78331 112.9925
1 71.51521 136.4873
2 69.39874 153.0269
3 68.2166 142.3354
4 67.78781 144.2971
5 68.69784 123.3024
6 69.80204 141.4947
7 70.01472 136.4623
8 67.90265 112.3723
9 66.78236 120.6672

2.

Add column "BMI"


# Add a "BMI" column computed as weight divided by height squared.
# NOTE(review): with pounds and inches this ratio is not on the usual BMI
# scale (the customary imperial formula multiplies by 703) - confirm the
# intended units before relying on these values.
data2=data.assign(BMI=data['Weight(Pounds)']/(data['Height(Inches)']*
data['Height(Inches)']))

3.

print("\n Maximum BMI = ",max(data2['BMI']))


print("\n Minimum BMI = ",min(data2['BMI']))

Maximum BMI = 0.03701443692089851

Minimum BMI = 0.018591137267932455


ASSIGNMENT 2 : STATISTICAL DATA ANALYSIS
SET A :

1.
import numpy as np

#Inserting the two data points

a=np.array((2,3))

b=np.array((4,5))

#Euclidean Distance

print("Euclidean Distance = ", np.linalg.norm(a-b))

Euclidean Distance = 2.8284271247461903

2.
Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'
],
'Scores' : [56,62,42,74,32,63,74,84,96,21]
}
#Create the dataframe from the dictionary of column values
data=pd.DataFrame.from_dict(data_values)
print(data) #To view the data frame
print("\n Mean Score = ",s.tmean(data["Scores"]) )
print("\n Maximum = ",max(data["Scores"]))
print("\n Minimum = ",min(data["Scores"]))
print("\n Range = ",
max(data["Scores"]) - min(data["Scores"]) )
q3,q1 = np.percentile(data["Scores"],[75,25])
print("\n Q3 = ", q3)
print("\n Q1 = ", q1)
print("\n IQR = ", q3 - q1)
Name Scores
0 A 56
1 B 62
2 C 42
3 D 74
4 E 32
5 F 63
6 G 74
7 H 84
8 I 96
9 J 21

Mean Score = 60.4

Maximum = 96

Minimum = 21

Range = 75

Q3 = 74.0

Q1 = 45.5

IQR = 28.5

3.

Program to find the Manhattan distance between two points


import math
def manhattan(a, b, n):
    """Return the Manhattan (L1) distance over the first n coordinates.

    Args:
        a: First point, an indexable sequence of numbers.
        b: Second point, an indexable sequence of numbers.
        n: Number of leading coordinates to compare.

    Returns:
        The sum of the absolute coordinate differences.

    Raises:
        IndexError: If either point has fewer than n coordinates.
    """
    # Indexing (rather than zip) keeps the IndexError for points that are too
    # short; no local accumulator means the built-in sum() is not shadowed.
    return sum(abs(a[i] - b[i]) for i in range(n))

a=[3,5,5,6,5,4,3]
b=[-2,3,2,-5,2,3,-1]

n=len(a) #or len(b)


print("Manhattan Distance = ", manhattan(a,b,n))

Manhattan Distance = 29

SET B

1.

data=pd.read_csv('iris.csv')
print("Number of records for different variety/class attribute \n")
data['variety'].value_counts()
Number of records for different variety/class attribute

Versicolor 50
Setosa 50
Virginica 50
Name: variety, dtype: int64

2.

import pandas as pd
from pandas.api.types import is_numeric_dtype
print("Iris Dataset : Column wise Mean and Median \n")
# Print mean and median for each numeric column; other columns are skipped.
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Mean = %.2f' % data[col].mean())
        print('\t Median = %.2f' % data[col].median())

Iris Dataset : Column wise Mean and Median

sepal.length:
Mean = 5.84
Median = 5.80
sepal.width:
Mean = 3.06
Median = 3.00
petal.length:
Mean = 3.76
Median = 4.35
petal.width:
Mean = 1.20
Median = 1.30

SET C :

1.

Program to find Minkowski Distance between two points


from math import *
from decimal import Decimal
def nth_root(value, root):
    """Return the root-th root of value as a Decimal rounded to 3 places.

    Args:
        value: Non-negative number to take the root of.
        root: Order of the root (for example 2 for a square root).

    Returns:
        decimal.Decimal rounded to three decimal places.

    Raises:
        ZeroDivisionError: If root is zero.
    """
    # Form the reciprocal in Decimal arithmetic so the exponent is not first
    # rounded to a binary float (1/3 is not exactly representable as one).
    exponent = Decimal(1) / Decimal(float(root))
    return round(Decimal(value) ** exponent, 3)


def minkowski(a, b, n):
    """Return the Minkowski distance of order n between points a and b.

    Args:
        a: First point, an iterable of numbers.
        b: Second point, an iterable of numbers paired with a via zip().
        n: Order of the distance (1 gives Manhattan, 2 gives Euclidean).

    Returns:
        decimal.Decimal rounded to three decimal places.
    """
    # The ** operator is used instead of a star-imported math.pow so integer
    # inputs are summed exactly.
    total = sum(abs(i - j) ** n for i, j in zip(a, b))
    return nth_root(total, n)

a=[-1,5]
b=[2,4]
n=len(a) #OR root value
print("\n Minkowski Distance = ",minkowski(a,b,n))
Minkowski Distance = 3.162
2.

import numpy as np

x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
print("\nOriginal array1:")
print(x)
print("\nOriginal array2:")
print(y)
# np.cov returns the 2x2 covariance matrix of x and y, so it is labelled as
# a covariance matrix rather than a cross-correlation.
print("\nCovariance matrix of the said arrays:\n", np.cov(x, y))
Original array1:
[0 1 3]

Original array2:
[2 4 5]

Covariance matrix of the said arrays:


[[2.33333333 2.16666667]
[2.16666667 2.33333333]]

3.

Create and view a data frame


#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Student' : ["1","2","3","4","5","6","7","8","9","10"],
'Subject 1':[41,62,35,15,21,65,84,75,42,95],
'Subject 2' : [56,62,42,74,32,63,74,84,96,21],
'Subject 3' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Subject 4' : [41,75,84,62,13,56,42,84,95,23],
'Subject 5' : [45,74,62,31,21,54,45,86,95,32]
}
#Create the dataframe from the dictionary of column values
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame
Index Student Subject 1 Subject 2 Subject 3 Subject 4 Subject 5
0 1 41 56 26 41 45
1 2 62 62 28 75 74
2 3 35 42 20 84 62
3 4 15 74 15 62 31
4 5 21 32 20 13 21
5 6 65 63 16 56 54
6 7 84 74 18 42 45
7 8 75 84 17 84 86
8 9 42 96 22 95 95
9 10 95 21 21 23 32
from pandas.api.types import is_numeric_dtype
from scipy.stats.mstats import gmean
import statistics as stat
print("Subject wise Mean \n")
# Print arithmetic, geometric and harmonic means for each numeric column;
# other columns are skipped.
for col in data.columns:
    if is_numeric_dtype(data[col]):
        print('%s:' % (col))
        print('\t Arithmetic Mean = %.2f' % data[col].mean())
        print('\t Geometric Mean = %.2f' % gmean(data[col]))
        print('\t Harmonic Mean = %.2f' % stat.harmonic_mean(data[col]))
Subject wise Mean

Subject 1:
Arithmetic Mean = 53.50
Geometric Mean = 46.35
Harmonic Mean = 38.71
Subject 2:
Arithmetic Mean = 60.40
Geometric Mean = 55.41
Harmonic Mean = 49.53
Subject 3:
Arithmetic Mean = 20.30
Geometric Mean = 19.93
Harmonic Mean = 19.58
Subject 4:
Arithmetic Mean = 57.50
Geometric Mean = 49.59
Harmonic Mean = 39.96
Subject 5:
Arithmetic Mean = 54.50
Geometric Mean = 49.33
Harmonic Mean = 44.27
ASSIGNMENT 3 : DATA PREPROCESSING
SET A

1.

import pandas as pd
import io
data = pd.read_csv('Data.csv',sep = ',')
data
index Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
a.

data.describe()
index Age Salary
count 9.0 9.0
mean 38.77777777777778 63777.77777777778
std 7.693792591722527 12265.579661982732
min 27.0 48000.0
25% 35.0 54000.0
50% 38.0 61000.0
75% 44.0 72000.0
max 50.0 83000.0

b.

# Summarise the frame: total cell count, shape tuple, row and column counts.
# The template is one string assembled from adjacent literals.
print(
    (
        "Size = {} \n Shape of DataFrame Object = {}\n Number of rows"
        " = {} \n Number of Columns = {}"
    ).format(data.size, data.shape, data.shape[0], data.shape[1])
)

Size = 40
Shape of DataFrame Object = (10, 4)
Number of rows = 10
Number of Columns = 4

c.

print("\n first 3 rows from Dataset")


data.head(3)
First 3 rows from dataset

index Country Age Salary Purchased


0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No

2. a.

Applying OneHot Encoding on Country Column


from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc_data= pd.DataFrame(enc.fit_transform(data[['Country']]).toarray()
)
enc_data

index 0 1 2 3
0 1.0 0.0 0.0 0.0
1 0.0 0.0 0.0 1.0
2 0.0 0.0 1.0 0.0
3 0.0 0.0 0.0 1.0
4 0.0 0.0 1.0 0.0
5 1.0 0.0 0.0 0.0
6 0.0 0.0 0.0 1.0
7 0.0 1.0 0.0 0.0
8 0.0 0.0 1.0 0.0
9 1.0 0.0 0.0 0.0

data_merge= data.join(enc_data)
data_merge

index Country Age Salary Purchased 0 1 2 3


0 France 44.0 72000.0 No 1.0 0.0 0.0 0.0
1 Spain 27.0 48000.0 Yes 0.0 0.0 0.0 1.0
2 Germany 30.0 54000.0 No 0.0 0.0 1.0 0.0
3 Spain 38.0 61000.0 No 0.0 0.0 0.0 1.0
4 Germany 40.0 NaN Yes 0.0 0.0 1.0 0.0
5 France 35.0 58000.0 Yes 1.0 0.0 0.0 0.0
6 Spain NaN 52000.0 No 0.0 0.0 0.0 1.0
7 France 48.0 79000.0 Yes 0.0 1.0 0.0 0.0
8 Germany 50.0 83000.0 No 0.0 0.0 1.0 0.0
9 France 37.0 67000.0 Yes 1.0 0.0 0.0 0.0

b.
Applying label encoding on purchased column
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
data['Purchased'] = labelencoder.fit_transform(data['Purchased'])
data

index Country Age Salary Purchased


0 France 44.0 72000.0 0
1 Spain 27.0 48000.0 1
2 Germany 30.0 54000.0 0
3 Spain 38.0 61000.0 0
4 Germany 40.0 NaN 1
5 France 35.0 58000.0 1
6 Spain NaN 52000.0 0
7 France 48.0 79000.0 1
8 Germany 50.0 83000.0 0
9 France 37.0 67000.0 1
#The Purchased labels are replaced by the numbers 0 and 1, where 'No' is
#assigned 0 and 'Yes' is assigned 1.

SET B

1.

# Rescaling Data
import pandas, scipy, numpy
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
array=data.values
#Separating data into input and output components
data_scaler=preprocessing.MinMaxScaler(feature_range=(0,1))
data_scaled = data_scaler.fit_transform(array)
print("\n Min Max Scaled Data \n \n ")
print(data_scaled.round(3))

Min Max Scaled Data

[[0.248 ………………………………, 0.4]


………………………………………………,
[0.124 ………………………, 0.4 0.6]]
2.

# Standardizing Data
from sklearn.preprocessing import StandardScaler
import scipy.stats as s
scaler=StandardScaler().fit(data)
std_data=scaler.transform(data)
print("\n Standardized Data \n ")
print(std_data)
print("\n Standardized Mean : ",s.tmean(std_data).round(2))
print(" Standardized Standard Deviation : ",round(std_data.std(),2))

Standardized Data

[[-0.528 ………………………… ]
[………………………………,
[………………………………………, 0.45084835]]

Standardized Mean : 0.0


Standardized Standard Deviation : 1.0

3.

# Normalizing Data
import numpy as np
import pandas as pd
import scipy.stats as s
from sklearn import preprocessing
norm_data=preprocessing.normalize(data,norm='l1')
print("\n Normalized Data \n ")
norm_data

Normalized Data

array([[0.099…………………………………….],
[………………………………………….., 0.06487013]])

4.

# Binarizing Data
# Fit a Binarizer with threshold 0.0 on the data and transform that same data.
binarized_data = preprocessing.Binarizer(threshold=0.0).fit(data).transform(data)
print("\n Binarized Data \n ")
binarized_data

Binarized Data

array([[1., 1., 0., ..., 1., 1., 1.],


[1., …………......, 1., 1., 1.],
[1., 1., 1., ..., 1., 1., 1.]])
SET C

1.

import pandas as pd
import io
data= pd.read_csv('Student_bucketing.csv')
data=pd.DataFrame(data)
# Split "marks" into 5 bins with pd.cut and name each bin.
data['bucket'] = pd.cut(
    data['marks'],
    5,
    labels=['Poor', 'Below_average', 'Average', 'Above_average', 'Excellent'],
)
data.head(10)

index Student_id Age Grade Employed marks bucket


0 1 19 1st Class yes 29 Poor
1 2 20 2nd Class no 41 Below_average
2 3 18 1st Class no 57 Average
3 4 21 2nd Class no 29 Poor
4 5 19 1st Class no 57 Average
5 6 20 2nd Class yes 53 Average
6 7 19 3rd Class yes 78 Above_average
7 8 21 3rd Class yes 70 Above_average
8 9 22 3rd Class yes 97 Excellent
9 10 21 1st Class no 58 Average
ASSIGNMENT 4 : DATA VISUALIZATION
SET A

1.

from matplotlib import pyplot as plt


import numpy as np
# generate random array using NumPy
a1 = np.random.randn(50)
a2 = np.random.randn(50)
plt.plot(a1,color="k",linewidth=1,linestyle=':')
plt.title("Line Chart")
plt.show()

plt.scatter(a1,a2,c=np.random.randn(50) ,marker ='*',alpha = 0.9)


plt.title("Scatter Plot")
plt.show()

plt.hist(a2,bins=15,facecolor ='lawngreen',edgecolor = "k",alpha=0.7)


print("Histogram")
Histogram
box=plt.boxplot(a2,vert=False,patch_artist = True)
print("Boxplot")

2.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
sns.countplot(x='variety',data = data)
plt.title("Iris Species Count")
plt.show()
3.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
ax=plt.subplots(1,1,figsize=(10,8))
data['variety'].value_counts().plot.pie(explode=[0.1,0.1,0.1],autopct
='%1.1f%%',shadow=True,figsize=(10,8))
plt.title("Iris Species %")
plt.show()

4.

import seaborn as sns


iris_setosa=data.loc[data["variety"]=="Setosa"]
iris_virginica=data.loc[data["variety"]=="Virginica"]
iris_versicolor=data.loc[data["variety"]=="Versicolor"]

# One histogram grid per feature, coloured by variety, each with a legend.
for feature in ("petal.length", "petal.width", "sepal.length", "sepal.width"):
    sns.FacetGrid(data, hue="variety").map(sns.histplot, feature).add_legend()
plt.show()
SET B

1.

import seaborn as sns


import matplotlib.pyplot as plt

def graph(a):
    """Draw a seaborn box plot of column a against the "variety" column.

    Args:
        a: Name of the column of the module-level ``data`` frame to plot on
            the y-axis.
    """
    sns.boxplot(x="variety", y=a, data=data)

plt.figure(figsize=(10,10))

plt.subplot(221)
graph('sepal.length')

plt.subplot(222)
graph('sepal.width')

plt.subplot(223)
graph('petal.length')

plt.subplot(224)
graph('petal.width')

plt.show()
SET C

1.

#Plot to compare all features of iris dataset


import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(data,hue='variety', height=2)
plt.show()

s
2.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
# KDE joint plot of sepal length vs. sepal width. `fill=True` replaces the
# deprecated `shade=True`; this notebook already relies on sns.histplot, so
# the installed seaborn supports `fill`.
g = sns.jointplot(x="sepal.length", y="sepal.width", data=data,
                  kind="kde", fill=True, color="b")
g.plot_joint(plt.scatter, c="gold", s=40, linewidth=1, marker="*")
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$SepalLength$", "$SepalWidth$")
plt.show()

You might also like