Data Science Practicals - Ipynb
Data Science Practicals - Ipynb
Name :F
Seat no : .
CONTENTS
1 The Data Science Environment
2 Statistical Data Analysis
3 Data Preprocessing
4 Data Visualization
ASSIGNMENT 1 : THE DATA SCIENCE ENVIRONMENT
SET A
In [ ]:
#Q1. Create and view a data frame
#import the library
import pandas as pd
import numpy as np
# Raw records, one list per column
data_values = dict(
    Name=list("ABCDEFGHIJ"),
    Age=[26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
    Percentage=[56, 62, 42, 74, 32, 63, 74, 84, 96, 21],
)
# Build the data frame directly from the column dictionary
data = pd.DataFrame(data_values)
data  # bare expression: the notebook renders the frame
0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21
In [ ]:
#Q2.
#print shape >> number of rows - columns
data.shape
Out[ ]: (10, 3)
In [ ]:
print("Size = {} \n Shape = {}\n Number of rows = {} \n Number of Columns = {}".
format(data.size, data.shape, data.shape[0], data.shape[1]))
Size = 30
Shape = (10, 3)
Number of rows = 10
Number of Columns = 3
In [ ]:
#feature names
print("data types")
data.dtypes
data types
Out[ ]: Name object
Age int64
Percentage int64
dtype: object
In [ ]:
print("Feature Names = {}, {}, {}".
format(data.columns[0], data.columns[1], data.columns[2]))
In [ ]:
print("Description of Data")
data.info()
Description of Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 10 non-null object
1 Age 10 non-null int64
2 Percentage 10 non-null int64
dtypes: int64(2), object(1)
memory usage: 368.0+ bytes
In [ ]: #Number of columns with null entries = 0
#Number of columns with numeric data = 2
#Number of columns with categorical data = 1
#Q3. obtaining basic statistical details of the data
data.describe(include = "all")
In [ ]:
# Mean Age = 20.3 yrs ; Mean % = 60.4 %
# Standard Deviation : sd(Age) = 4.191261 ;sd(%) = 23.381854
# Minimum Age =15 yrs ; Maximum Age = 28 yrs
# Minimum % = 21% ; Maximum % = 96%
0 A 26 56 None
1 B 28 62 None
2 C 20 42 None
3 D 15 74 None
4 E 20 32 None
5 F 16 63 None
6 G 18 74 None
7 H 17 84 None
8 I 22 96 None
9 J 21 21 None
10 K 21 56 None
11 L 21 None None
12 M None 45 None
13 K 21 56 None
14 O 25 84 None
In [ ]:
#Q5.
# Report the row count, the total number of missing cells, and how many
# rows duplicate another row.
print("Number of Observations = ", len(data.index))
# isnull() flags every missing cell; the first sum() totals per column,
# the second totals across the whole frame. (This statement was pasted
# twice in the original cell; it is printed once here.)
print(" \nTotal missing values in a DataFrame : \n\n",
      data.isnull().sum().sum())
# duplicated() yields True for rows that repeat another row, so the count
# listed under True is the number of duplicate observations.
print(data.duplicated().value_counts())
Number of Observations = 15
17
Total missing values in a DataFrame :
17
False 14
True 1
dtype: int64
In [ ]:
#duplicate observations = 1
#Q6. Removing a column and missing values
data2=data.drop(columns='Remarks')
data2=data2.dropna(axis=0)
#print modified data
data2
0 A 26 56
1 B 28 62
2 C 20 42
3 D 15 74
4 E 20 32
5 F 16 63
6 G 18 74
7 H 17 84
8 I 22 96
9 J 21 21
10 K 21 56
13 K 21 56
14 O 25 84
In [ ]:
#Q7. Line plot
import matplotlib.pyplot as plt
data2.plot(x="Name",y="Percentage",
title="Line Plot of Name Vs Percentage")
plt.xlabel("Names")
plt.ylabel("Percentages")
plt.show()
In [ ]:
#Q8. Scatterplot
data2.plot.scatter(x='Name',y='Percentage',
title = "Scatterplot")
plt.show()
SET B
In [ ]:
#Q1.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving SOCR-HeightWeight.csv to SOCR-HeightWeight.csv
In [ ]:
import pandas as pd
data=pd.read_csv('SOCR-HeightWeight.csv')
data.head(10) #print first 10 rows
Index Height(Inches) Weight(Pounds)
0 1 65.78331 112.9925
1 2 71.51521 136.4873
2 3 69.39874 153.0269
3 4 68.21660 142.3354
4 5 67.78781 144.2971
5 6 68.69784 123.3024
6 7 69.80204 141.4947
7 8 70.01472 136.4623
8 9 67.90265 112.3723
9 10 66.78236 120.6672
In [ ]:
data.tail(10) #print last 10 rows
In [ ]:
data.sample(20) #print 20 random rows
In [ ]:
#Q2.
print("Size = {} \n Shape of DataFrame Object = {}\n Number of rows = {} \n Number of Columns = {}".
format(data.size, data.shape, data.shape[0], data.shape[1]))
print("\n Datatypes of dataframe object")
data.dtypes
Size = 75000
Shape of DataFrame Object = (25000, 3)
Number of rows = 25000
Number of Columns = 3
In [ ]:
#Q3.
data.describe() #basic statistical details
In [ ]:
#Mean Height = 67.9931 Inches ; Mean Weight = 127.0794 Pounds
#sd(Height) = 1.9017 ; sd(Weight) = 11.6609
#Minimum Height = 60.2784 Inches ; Minimum Weight = 78.0148 Pounds
#Maximum Height = 75.1528 Inches ; Maximum Weight = 170.924 Pounds
#Q4.
print("\n Description of Data")
data.info()
print("\n Number of Observations = ", len(data.index))
print(" \nTotal missing values in a DataFrame = ",data.isnull().sum().sum())
In [ ]:
#Q5.
#Add column "BMI"
# The data set stores weight in pounds and height in inches, so the
# imperial form of the formula applies: BMI = 703 * lb / in**2.
# Without the factor 703 the values are not on the standard BMI scale.
BMI_IMPERIAL_FACTOR = 703
height_inches = data['Height(Inches)']
data2 = data.assign(
    BMI=BMI_IMPERIAL_FACTOR * data['Weight(Pounds)'] / (height_inches * height_inches)
)
data2.head(1)
In [ ]:
#Q6.
print("Maximum BMI = ",max(data2['BMI']))
print("\n Minimum BMI = ",min(data2['BMI']))
In [ ]:
#Q7.
data.plot(x='Weight(Pounds)',y='Height(Inches)',kind="scatter", title = "ScatterPlot of height vs weight ")
plt.show()
ASSIGNMENT 2 : STATISTICAL DATA ANALYSIS
SET A
In [ ]:
#Q1.
import numpy as np
# 2x2 matrix holding the integers 0..3, filled row by row
array = np.arange(4).reshape(2, 2)
# Extremes are taken over every element of the matrix
largest, smallest = array.max(), array.min()
print("\n Original flattened array: \n", array)
print(" \n Maximum Value of the above flattened array : \n ", largest)
print(" \n Minimum Value of the above flattened array : \n ", smallest)
In [ ]:
#Q2.
import numpy as np
# The two points, stored as coordinate vectors
a, b = np.array([2, 3]), np.array([4, 5])
# Euclidean distance is the L2 norm of the difference vector
difference = a - b
print("Euclidean Distance = ", np.linalg.norm(difference))
In [ ]:
#Q3. Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
# Raw records, one list per column
data_values = {
    'Name': list("ABCDEFGHIJ"),
    'Scores': [56, 62, 42, 74, 32, 63, 74, 84, 96, 21],
}
# Build the data frame directly from the column dictionary
data = pd.DataFrame(data_values)
print(data)  # show the frame

# Every statistic below is taken over the Scores column
scores = data["Scores"]
highest = max(scores)
lowest = min(scores)
print("\n Mean Score = ", s.tmean(scores))
print("\n Maximum = ", highest)
print("\n Minimum = ", lowest)
print("\n Range = ", highest - lowest)

# First and third quartiles; their difference is the inter-quartile range
q1, q3 = np.percentile(scores, [25, 75])
print("\n Q3 = ", q3)
print("\n Q1 = ", q1)
print("\n IQR = ", q3 - q1)
Name Scores
0 A 56
1 B 62
2 C 42
3 D 74
4 E 32
5 F 63
6 G 74
7 H 84
8 I 96
9 J 21
Maximum = 96
Minimum = 21
Range = 75
Q3 = 74.0
Q1 = 45.5
IQR = 28.5
In [ ]:
#Program to find Manhattan Distance between two points
import math
def manhattan(a,b):
    """Return the Manhattan (L1) distance between points `a` and `b`.

    `a` and `b` are sequences of numeric coordinates. They are walked in
    lockstep with zip(), so if the lengths differ the extra coordinates of
    the longer sequence are ignored.
    """
    return sum(abs(val1 - val2) for val1, val2 in zip(a,b))
#consider any two points
a=[2,3]
b=[4,5]
print ("Points :",a,b)
print("\n Manhattan Distance = ", manhattan(a,b))
Manhattan Distance = 4
In [ ]:
#Q4. Program to find Manhattan distance between all pairs of points
import math
def manhattan(a,b,n):
    """Return the sum of |a[i] - b[i]| over the first `n` positions.

    `a` and `b` are indexable sequences of numbers and `n` is how many
    leading positions to include. Returns 0 when `n` is 0.
    """
    # The builtin sum() is used directly; the original accumulated into a
    # local that was itself named `sum`, hiding the builtin in this scope.
    return sum(abs(a[i] - b[i]) for i in range(n))
In [ ]:
#Example
a=[3,5,5,6,5,4,3]
b=[-2,3,2,-5,2,3,-1]
# The cell's recorded output shows a distance of 29, but the call that
# printed it was missing. Use the three-argument manhattan() defined in the
# previous cell over all len(a) positions.
print("\n Manhattan Distance = ", manhattan(a,b,len(a)))
Manhattan Distance = 29
In [ ]:
#Manhattan and Euclidean Distance
import scipy.spatial as sp
In [ ]:
#Q5.
import numpy as np
import matplotlib.pyplot as plt
n=np.array([0.5, 0.7, 1.0, 1.2, 1.3, 2.1])
b=np.array([0,1,2,3])
print("\n nums:",n)
print("\n bins:",b )
print("\n Result: \n",np.histogram(n,b))
print("\n")
plt.hist(n,b)
plt.show()
bins: [0 1 2 3]
Result:
(array([2, 3, 1]), array([0, 1, 2, 3]))
In [ ]:
#Q6.Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Name':['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
'Graduation Percentage' : [56,62,42,74,32,63,74,84,96,21],
'Age' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21]
}
#Create empty dataframe with column names
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame
0 A 56 26
1 B 62 28
2 C 42 20
3 D 74 15
4 E 32 20
5 F 63 16
6 G 74 18
7 H 84 17
8 I 96 22
9 J 21 21
In [ ]:
print("\n Average age of students = ",s.tmean(data["Age"]) )
print("\n Average Graduation Percentage = ",s.tmean(data["Graduation Percentage"]) )
print("\n All Basic Statistics of Data \n ")
data.describe(include='all')
In [ ]:
print("\n Measures of Dispersion and Position in the Distribution")
# All measures are taken over the Graduation Percentage column
percentages = data["Graduation Percentage"]
r = max(percentages) - min(percentages)
print("\n Value of Range in the Distribution = ", r)
# Stored as `sd`, not `s`: this notebook imports scipy.stats under the
# alias `s`, and rebinding it to a number would break every later
# s.tmean(...) call if a cell is re-run.
sd = round(percentages.std(), 3)
print("Value of Standard Deviation in the Distribution = ", sd)
v = round(percentages.var(), 3)
print("Value of Variance in the Distribution = ", v)
SET B
In [ ]:
#Q1.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving iris.csv to iris.csv
In [ ]:
# pandas must be bound to `pd`, the name read_csv is called through below.
# The original bound it to `numpy`, which only worked because an earlier
# cell had already imported pandas as pd.
import pandas as pd
#Read csv file
data=pd.read_csv('iris.csv')
In [ ]:
data.sample(13)
In [ ]:
from pandas.api.types import is_numeric_dtype
print("Minimum and Maximum for all numeric attributes\n")
for col in data.columns:
if is_numeric_dtype(data[col]):
print('%s:' % (col))
print('\t Minimum = ',data[col].min())
print('\t Maximum = ',data[col].max())
sepal.length:
Minimum = 4.3
Maximum = 7.9
sepal.width:
Minimum = 2.0
Maximum = 4.4
petal.length:
Minimum = 1.0
Maximum = 6.9
petal.width:
Minimum = 0.1
Maximum = 2.5
In [ ]:
#Q2.
print("Number of records for different variety/class attribute \n")
data['variety'].value_counts()
Out[ ]: Versicolor 50
Setosa 50
Virginica 50
Name: variety, dtype: int64
In [ ]:
#Q3.
import pandas as pd
from pandas.api.types import is_numeric_dtype
print("Iris Dataset : Column wise Mean and Median \n")
for col in data.columns:
if is_numeric_dtype(data[col]):
print('%s:' % (col))
print('\t Mean = %.2f' % data[col].mean())
print('\t Median = %.2f' % data[col].median())
sepal.length:
Mean = 5.84
Median = 5.80
sepal.width:
Mean = 3.06
Median = 3.00
petal.length:
Mean = 3.76
Median = 4.35
petal.width:
Mean = 1.20
Median = 1.30
SET C
In [ ]:
#Q1. Program to find Minkowski Distance between two points
from math import *
from decimal import Decimal
def nth_root(value,root):
    """Return the `root`-th root of `value`, rounded to 3 decimal places.

    The exponent 1/root is computed as a float and then converted to
    Decimal, so the result is a Decimal. Raises ZeroDivisionError when
    `root` is 0.
    """
    root_value = 1/float(root)
    return round(Decimal(value)**Decimal(root_value),3)


def minkowski(a,b,n):
    """Return the Minkowski distance of order `n` between points `a` and `b`.

    Sums |a_i - b_i|**n over the coordinate pairs produced by zip(a, b),
    then takes the n-th root via nth_root(), so the result is a Decimal
    rounded to 3 places.
    """
    return nth_root(sum(pow(abs(i-j),n) for i,j in zip(a,b)),n)
In [ ]:
# Two sample points with two coordinates each
a=[-1,5]
b=[2,4]
# n is passed to minkowski() as its third argument. Here it is set to the
# number of coordinates, which is 2 for these points.
# NOTE(review): presumably n is meant as the order (root) of the distance;
# confirm it should track len(a) rather than be chosen independently.
n=len(a) #OR root value
print("\n Minkowski Distance = ",minkowski(a,b,n))
In [ ]:
#Q2.
import numpy as np
# 3x3 grid of the integers 0..8, filled row by row
a = np.arange(9).reshape(3, 3)
# One weight per column, applied along axis 1 to give one average per row
column_weights = [1./4, 2./4, 2./4]
row_averages = np.average(a, axis=1, weights=column_weights)
print("Original flattened array:")
print(a)
print("Weighted average along the specified axis of the above flattened array:")
print(row_averages)
In [ ]:
#Q3.
import numpy as np
x = np.array([0, 1, 3])
y = np.array([2, 4, 5])
print("\nOriginal array1:")
print(x)
# The second heading previously repeated "array1"
print("\nOriginal array2:")
print(y)
# np.cov(x, y) returns the 2x2 covariance matrix of the two arrays, so the
# heading now says so; it was labelled "Cross-correlation" before.
# NOTE(review): if the exercise really asks for cross-correlation,
# np.correlate(x, y, mode="full") is the matching call. Confirm against the
# question text.
print("\nCovariance matrix of the said arrays:\n",np.cov(x, y))
Original array1:
[0 1 3]
Original array1:
[2 4 5]
In [ ]:
#Q4. Wholesale Customers Data from UCI
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving Wholesale customers data.csv to Wholesale customers data.csv
In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('Wholesale customers data.csv')
data.describe()
Channel:
Mean = 1.32
Region:
Mean = 2.54
Fresh:
Mean = 12000.30
Milk:
Mean = 5796.27
Grocery:
Mean = 7951.28
Frozen:
Mean = 3071.93
Detergents_Paper:
Mean = 2881.49
Delicassen:
Mean = 1524.87
In [ ]:
#Q5.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving nursery.data.csv to nursery.data.csv
In [2]:
import pandas as pd
#Read csv file
data=pd.read_csv('nursery.data.csv')
In [5]:
data.head(5)
In [10]:
#Group by proper
import numpy as np
data_by_proper=data.groupby('proper')
data_by_proper.count()
proper
proper
In [ ]:
#Q6.Create and view a data frame
#import the library
import pandas as pd
import numpy as np
import scipy.stats as s
#Enter Data
data_values={'Student' : ["1","2","3","4","5","6","7","8","9","10"],
'Subject 1':[41,62,35,15,21,65,84,75,42,95],
'Subject 2' : [56,62,42,74,32,63,74,84,96,21],
'Subject 3' : [26, 28, 20, 15, 20, 16, 18, 17, 22, 21],
'Subject 4' : [41,75,84,62,13,56,42,84,95,23],
'Subject 5' : [45,74,62,31,21,54,45,86,95,32]
}
#Create empty dataframe with column names
data=pd.DataFrame.from_dict(data_values)
data #To view the data frame
Student Subject 1 Subject 2 Subject 3 Subject 4 Subject 5
0 1 41 56 26 41 45
1 2 62 62 28 75 74
2 3 35 42 20 84 62
3 4 15 74 15 62 31
4 5 21 32 20 13 21
5 6 65 63 16 56 54
6 7 84 74 18 42 45
7 8 75 84 17 84 86
8 9 42 96 22 95 95
9 10 95 21 21 23 32
In [ ]:
from pandas.api.types import is_numeric_dtype
from scipy.stats.mstats import gmean
import statistics as stat
print("Subject wise Mean \n")
for col in data.columns:
if is_numeric_dtype(data[col]):
print('%s:' % (col))
print('\t Arithmetic Mean = %.2f' % data[col].mean())
print('\t Geometric Mean = %.2f' % gmean(data[col]))
print('\t Harmonic Mean = %.2f' % stat.harmonic_mean(data[col]))
Subject 1:
Arithmetic Mean = 53.50
Geometric Mean = 46.35
Harmonic Mean = 38.71
Subject 2:
Arithmetic Mean = 60.40
Geometric Mean = 55.41
Harmonic Mean = 49.53
Subject 3:
Arithmetic Mean = 20.30
Geometric Mean = 19.93
Harmonic Mean = 19.58
Subject 4:
Arithmetic Mean = 57.50
Geometric Mean = 49.59
Harmonic Mean = 39.96
Subject 5:
Arithmetic Mean = 54.50
Geometric Mean = 49.33
Harmonic Mean = 44.27
In [ ]:
#Q7.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving iris.csv to iris.csv
In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('iris.csv')
In [ ]:
! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
In [ ]:
import pandas as pd
data=pd.read_csv('iris.csv')
In [ ]:
import pandas_profiling
profile = data.profile_report(title="Statistal Data Analysis")
profile
Overview
Missing cells 0
Missing cells (%) 0.0%
Duplicate rows 1
Alerts
Dataset has 1 (0.7%) duplicate rows Duplicates
sepal.length is highly correlated with petal.length and 1 other fields (petal.length, High correlation
petal.width)
petal.length is highly correlated with sepal.length and 1 other fields (sepal.length, High correlation
petal.width)
petal.width is highly correlated with sepal.length and 1 other fields (sepal.length, High correlation
petal.length)
sepal.length is highly correlated with petal.length and 1 other fields (petal.length, High correlation
petal.width)
Out[ ]:
In [ ]:
#Saving the file
profile.to_file("Data Analysis.html")
ASSIGNMENT 3 : DATA PREPROCESSING
SET A
In [ ]:
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving Data.csv to Data.csv
In [ ]:
import pandas as pd
import io
data = pd.read_csv('Data.csv',sep = ',')
data
In [ ]:
#Q1.a
data.describe()
In [ ]:
#b.)
print("Size = {} \n Shape of DataFrame Object = {}\n Number of rows = {} \n Number of Columns = {}".
format(data.size, data.shape, data.shape[0], data.shape[1]))
Size = 40
Shape of DataFrame Object = (10, 4)
Number of rows = 10
Number of Columns = 4
In [ ]:
#c.)
print("\n first 3 rows from Dataset")
data.head(3)
In [ ]:
#Q2.
#Handling Missing values
# Replace each missing numeric value with the mean of its column.
# numeric_only=True is needed because the frame also has text columns
# (Country, Purchased); without it, recent pandas raises a TypeError
# when it tries to average them.
# The result is assigned back so the following cells work on the filled
# data. The original discarded it, leaving `data` unchanged.
data = data.fillna(data.mean(numeric_only=True))
data
In [ ]:
#Q3. a. Applying OneHot Encoding on Country Column
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
enc_data= pd.DataFrame(enc.fit_transform(data[['Country']]).toarray())
enc_data
Out[ ]:
0 1 2 3
In [ ]:
data_merge= data.join(enc_data)
data_merge
In [ ]:
#The purchased labels are replaced by numbers 0 and 1,
# where 'No' is assigned 0, and 'Yes' is assigned 1.
SET B
In [ ]:
#Q1.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving winequality-red.csv to winequality-red.csv
In [ ]:
import pandas as pd
#Read csv file
data=pd.read_csv('winequality-red.csv',sep=';')
data.shape
In [ ]:
#Q2. Rescaling Data
import pandas, scipy, numpy
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
# Take the frame's values as a plain 2-D array; every column is scaled,
# no input/output split is performed in this cell.
array=data.values
# Scaler configured to map each column onto the range 0..1
data_scaler=preprocessing.MinMaxScaler(feature_range=(0,1))
# Fit the scaler on the array and transform it in one step
data_scaled = data_scaler.fit_transform(array)
print("\n Min Max Scaled Data \n \n ")
# Show the scaled values rounded to 3 decimals
print(data_scaled.round(3))
In [ ]:
# This gives us values between 0 and 1.
# Rescaling data proves of use with neural networks,
# optimization algorithms and those that use distance measures like
# k-nearest neighbors and weight inputs like regression.
In [ ]:
#Q3. Standardizing Data
from sklearn.preprocessing import StandardScaler
import scipy.stats as s
scaler=StandardScaler().fit(data)
std_data=scaler.transform(data)
print("\n Standardized Data \n ")
print(std_data)
print("\n Standardized Mean : ",s.tmean(std_data).round(2))
print(" Standardized Standard Deviation : ",round(std_data.std(),2))
Standardized Data
In [ ]:
#Q4. Normalizing Data
import numpy as np
import pandas as pd
import scipy.stats as s
from sklearn import preprocessing
norm_data=preprocessing.normalize(data,norm='l1')
print("\n Normalized Data \n ")
norm_data
Normalized Data
In [ ]:
#Q5. Binarizing Data
binarized_data=preprocessing.Binarizer(threshold=0.0).fit(data).transform(data)
print("\n Binarized Data \n ")
binarized_data
Binarized Data
SET C
In [ ]:
#Q1.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving Student_bucketing.csv to Student_bucketing.csv
In [ ]:
import pandas as pd
import io
data= pd.read_csv('Student_bucketing.csv')
data=pd.DataFrame(data)
In [ ]:
#Q2.
print("First 5 Rows of the dataset \n ")
data.head(5)
1 2 20 2nd Class no 41
2 3 18 1st Class no 57
3 4 21 2nd Class no 29
4 5 19 1st Class no 57
In [ ]:
#Q3.
import pandas as pd
data['bucket']=pd.cut(data['marks'],5,
labels=['Poor','Below_average','Average','Above_average','Excellent'])
data.head(10)
ASSIGNMENT 4 : DATA VISUALIZATION
SET A
In [ ]:
#Q1.
from matplotlib import pyplot as plt
import numpy as np
# generate random array using NumPy
a1 = np.random.randn(50)
a2 = np.random.randn(50)
plt.plot(a1,color="k",linewidth=1,linestyle=':')
plt.title("Line Chart")
plt.show()
In [ ]:
plt.scatter(a1,a2,c=np.random.randn(50) ,marker ='*',alpha = 0.9)
plt.title("Scatter Plot")
plt.show()
In [ ]:
plt.hist(a2,bins=15,facecolor ='lawngreen',edgecolor = "k",alpha=0.7)
print("Histogram")
Histogram
In [ ]:
box=plt.boxplot(a2,vert=False,patch_artist = True)
print("Boxplot")
Boxplot
In [ ]:
#Q2.
a3=np.append(a2,[[5,-4]])
plt.boxplot(a3,vert=False)
print("Boxplot with outliers")
plt.show()
In [ ]:
#Q3.
from matplotlib import pyplot as plt
import numpy as np
subjects=['English','Comp Sci','Maths','Physics','Statistics','Algebra','Mechanics']
marks =[45,74,62,31,21,87,95]
plt.pie(marks,labels = subjects,autopct='%1.1f%%')
print("Pie Plot")
plt.show()
Pie Plot
In [ ]:
print("Bar Plot")
bar=plt.bar(subjects,marks,color='g')
def gradientbars(bars):
    """Repaint every bar in `bars` with a vertical gradient image.

    `bars` is the container returned by plt.bar(). Each bar's face is made
    transparent and an image of a 0..1 ramp is drawn underneath it, clipped
    to the bar's rectangle. Does nothing when `bars` is empty.
    """
    if not bars:
        # Nothing to paint; bars[0] below would raise IndexError.
        return
    # 256x1 column ramping from 0 to 1, used as the gradient image
    grad = np.atleast_2d(np.linspace(0,1,256)).T
    ax = bars[0].axes
    # Capture the current x/y limits so they can be restored after drawing
    lim = ax.get_xlim()+ax.get_ylim()
    for rect in bars:
        # Keep the bar outline above the gradient image (zorder 1 vs 0)
        rect.set_zorder(1)
        rect.set_facecolor("none")
        x,y = rect.get_xy()
        w, h = rect.get_width(), rect.get_height()
        ax.imshow(grad, extent=[x,x+w,y,y+h], aspect="auto", zorder=0)
    # Restore the limits captured before the images were added
    ax.axis(lim)
gradientbars(bar)
plt.show()
Bar Plot
In [ ]:
#Q4.
from google.colab import files
data=files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please
rerun this cell to enable.
Saving iris.csv to iris.csv
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
sns.countplot(x='variety',data = data)
plt.title("Iris Species Count")
plt.show()
In [ ]:
#Q5.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
# plt.subplots returns a (figure, axes) pair; unpack it instead of binding
# the whole tuple to `ax`.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Draw the pie on the axes created above. The figure size is already set
# there, so it is not repeated in the plot call.
data['variety'].value_counts().plot.pie(
    explode=[0.1, 0.1, 0.1], autopct='%1.1f%%', shadow=True, ax=ax
)
plt.title("Iris Species %")
plt.show()
In [ ]:
#Q6.
import seaborn as sns
iris_setosa=data.loc[data["variety"]=="Setosa"]
iris_virginica=data.loc[data["variety"]=="Virginica"]
iris_versicolor=data.loc[data["variety"]=="Versicolor"]
sns.FacetGrid(data,hue="variety").map(sns.histplot,"petal.length").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"petal.width").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"sepal.length").add_legend()
sns.FacetGrid(data,hue="variety").map(sns.histplot,"sepal.width").add_legend()
plt.show()
SET B
In [ ]:
#Q1.
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("iris.csv")
# Plot the Setosa rows first. NOTE: despite its name, `fig` holds the object
# returned by plot.scatter here, which the next two calls reuse via ax=fig
# so that all three varieties share one set of axes.
fig = data[data.variety=='Setosa'].plot.scatter(x='petal.length',y='petal.width',color='cyan', label='Setosa')
data[data.variety=='Versicolor'].plot.scatter(x='petal.length',y='petal.width',color='violet', label='versicolor',ax=fig)
data[data.variety=='Virginica'].plot.scatter(x='petal.length',y='petal.width',color='lawngreen', label='virginica', ax=fig)
# Axis labels and title are set on the shared axes object
fig.set_xlabel("Petal Length")
fig.set_ylabel("Petal Width")
fig.set_title(" Petal Length VS Width")
# From here on `fig` is rebound to the current figure, to resize it
fig=plt.gcf()
fig.set_size_inches(12,8)
plt.show()
In [ ]:
#Q2.
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv("iris.csv")
fig = data[data.variety=='Setosa'].plot.scatter(x='sepal.length',y='sepal.width',color='g', label='Setosa')
data[data.variety=='Versicolor'].plot.scatter(x='sepal.length',y='sepal.width',color='r', label='versicolor',ax=fig)
data[data.variety=='Virginica'].plot.scatter(x='sepal.length',y='sepal.width',color='gold', label='virginica', ax=fig)
fig.set_xlabel("Sepal Length")
fig.set_ylabel("Sepal Width")
fig.set_title(" Sepal Length VS Width")
fig=plt.gcf()
fig.set_size_inches(12,8)
plt.show()
In [ ]:
#Q3.
import seaborn as sns
import matplotlib.pyplot as plt
def graph(a):
    """Draw a box plot of column `a`, grouped by the "variety" column.

    `a` is a column name of the module-level `data` frame, which this
    function reads directly; the plot goes through seaborn's boxplot.
    """
    sns.boxplot(x="variety", y=a, data=data)
plt.figure(figsize=(10,10))
plt.subplot(221)
graph('sepal.length')
plt.subplot(222)
graph('sepal.width')
plt.subplot(223)
graph('petal.length')
plt.subplot(224)
graph('petal.width')
plt.show()
SET C
In [ ]:
#Q1.
#Plot to compare all features of iris dataset
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(data,hue='variety', height=2)
plt.show()
In [ ]: #Q2.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10,10))
plt.subplot(221)
sns.boxplot(x="variety", y="sepal.length", data=data,palette="bwr");
plt.subplot(222)
sns.boxplot(x="variety", y="sepal.width", data=data,palette="magma")
plt.subplot(223)
sns.boxplot(x="variety", y="petal.length", data=data,palette="autumn")
plt.subplot(224)
sns.boxplot(x="variety", y="petal.width", data=data,palette="GnBu")
plt.show()
In [ ]:
#Q3.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
data= pd.read_csv("iris.csv")
# Joint KDE of sepal length against sepal width. `fill=True` is the current
# spelling of the deprecated `shade=True` keyword for filled density
# contours.
g = sns.jointplot(x="sepal.length", y="sepal.width", fill=True, data=data, kind="kde", color="b")
# Overlay the raw observations as gold stars on the joint axes
g.plot_joint(plt.scatter, c="gold", s=40, linewidth=1, marker="*")
# Make the first collection drawn on the joint axes fully transparent.
# NOTE(review): which artist collections[0] is depends on the seaborn and
# matplotlib versions; confirm this still hides only what is intended.
g.ax_joint.collections[0].set_alpha(0)
g.set_axis_labels("$SepalLength$", "$SepalWidth$")
plt.show()
THE END