Question 1. Tips dataset

• Read the dataset “Tips.csv” as a dataframe “Data”.

• Extract the columns in the following sequence – Time, TotalBill, Tips.

• Plot a histogram for the variable ‘TotalBill’ to check which range has the highest frequency.

• Draw a bar chart for the variable “Day”. Identify the category with the maximum count.

• Demonstrate the data distributions using box plot, scatter plot, histogram, and bar chart
on the iris dataset.


• Demonstrate the correlation plot on the iris dataset and perform exploratory visualization,
giving an overview of relationships among data with covariance analysis.


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Task 1: Read the "Tips.csv" dataset as a dataframe "Data"
# NOTE: the file is opened as "tips.csv"; on a case-sensitive filesystem the
# name must match the file on disk exactly.
Data = pd.read_csv("tips.csv")

# Task 2: Extract the columns in the sequence Time, TotalBill, Tips
# The selection is stored in its own frame instead of overwriting "Data", so
# the 'day' column is still available for the bar chart in Task 4.
# Assumes the CSV uses the lowercase names 'time', 'total_bill' and 'tip'
# (in line with the 'total_bill' / 'tip' / 'day' names used in this script)
# -- TODO confirm against the file header.
Extracted = Data[['time', 'total_bill', 'tip']]
print(Extracted.head())

# Task 3: Plot a histogram for the variable 'total_bill'
plt.figure(figsize=(8, 6))
counts, bin_edges, _ = plt.hist(Data['total_bill'], bins=20, color='skyblue', edgecolor='black')
plt.title('Histogram of TotalBill')
plt.xlabel('TotalBill')
plt.ylabel('Frequency')
plt.show()

# Report which bin holds the most observations (the question asks for the
# range with the highest frequency, which is hard to read off the plot alone).
peak_bin = counts.argmax()
print(f"Range with the highest frequency: {bin_edges[peak_bin]:.2f} - {bin_edges[peak_bin + 1]:.2f}")

# Task 4: Draw a bar chart for the variable "day". Identify the category with the maximum
plt.figure(figsize=(8, 6))
sns.countplot(x='day', data=Data, hue='day', palette='viridis', legend=False)
plt.title('Bar Chart for Day')
plt.show()

# The most frequent category is the first index of the value counts.
print(f"Day with the maximum count: {Data['day'].value_counts().idxmax()}")

# Task 5: Load the iris dataset from the local file
iris = pd.read_csv("iris.csv")

# The four numeric measurement columns used by the distribution plots below.
feature_columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

# Task 6: Demonstrate data distributions using box, scatter plot, histogram, and bar chart
# Box plot
plt.figure(figsize=(12, 6))
sns.boxplot(data=iris[feature_columns])
plt.title('Box Plot of Iris Dataset')
plt.show()

# Scatter plot
# Drawn on its own figure; previously only a title was set here, which
# silently replaced the box plot's title and produced no scatter plot.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='sepallength', y='sepalwidth', hue='Flowers', data=iris)
plt.title('Scatter Plot of Iris Dataset')
plt.show()

# Histogram
plt.figure(figsize=(8, 6))
sns.histplot(data=iris[feature_columns], kde=True)
plt.title('Histogram of Iris Dataset')
plt.show()

# Bar chart
plt.figure(figsize=(8, 6))
sns.countplot(x='Flowers', data=iris, hue='Flowers', palette='Set2', legend=False)
plt.title('Bar Chart of Flowers in Iris Dataset')
plt.show()

# Task 7: Demonstrate the correlation plot on the iris dataset
# "iris" has not been modified since it was loaded, so no reload is needed.

# Convert the 'Flowers' column to numerical values so that corr()/cov()
# can include it alongside the measurements.
iris['Flowers'] = iris['Flowers'].astype('category').cat.codes
plt.figure(figsize=(10, 8))
sns.heatmap(iris.corr(), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Plot of Iris Dataset')
plt.show()

# Task 8: Perform exploratory visualization with covariance analysis

# Pair plot with hue based on species
sns.pairplot(iris, hue='Flowers', palette='viridis', markers=["o", "s", "D"])
plt.suptitle('Pair Plot of Iris Dataset with Species Hue', y=1.02)
plt.show()

# Covariance matrix
covariance_matrix = iris.cov()

# Print covariance matrix (header and the matrix itself)
print("Covariance Matrix:")
print(covariance_matrix)

Covariance Matrix:

sepallength sepalwidth petallength petalwidth Flowers

sepallength 0.685694 -0.039268 1.273682 0.516904 0.530872

sepalwidth -0.039268 0.188004 -0.321713 -0.117981 -0.148993

petallength 1.273682 -0.321713 3.113179 1.296387 1.371812

petalwidth 0.516904 -0.117981 1.296387 0.582414 0.597987

Flowers 0.530872 -0.148993 1.371812 0.597987 0.671141

Question 2: Split the Iris dataset into two datasets – IrisTest_TrainData.csv and IrisTest_TestData.csv.

• Read them as two separate data frames named Train_Data and Test_Data

• Answer the following questions:

➢ How many missing values are there in Train_Data?

➢ What is the proportion of Setosa types in the Test_Data?

➢ What is the accuracy score of the K-Nearest Neighbor model (model_1) with 2/3

neighbors using Train_Data and Test_Data?

➢ Identify the list of indices of misclassified samples from the ‘model_1’.

➢ Build a logistic regression model (model_2) keeping the modelling steps constant.

Find the accuracy of model_2.


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris_data = pd.read_csv('iris.csv') # Replace with the correct file path

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible between runs)
train_split, test_split = train_test_split(iris_data, test_size=0.2, random_state=42)

# Save the split datasets to CSV files
train_split.to_csv('IrisTest_TrainData.csv', index=False)
test_split.to_csv('IrisTest_TestData.csv', index=False)

# Read them back as two separate data frames, as the question requires.
# The files are written without the index, so rows are renumbered from 0
# here; the misclassified indices reported below are therefore row positions
# within IrisTest_TestData.csv.
Train_Data = pd.read_csv('IrisTest_TrainData.csv')
Test_Data = pd.read_csv('IrisTest_TestData.csv')

# 1. How many missing values are there in Train_Data?
missing_values_train = Train_Data.isnull().sum().sum()
print(f"Number of missing values in Train_Data: {missing_values_train}")

# 2. What is the proportion of Setosa types in Test_Data?
# The mean of a boolean mask is the fraction of rows that match.
setosa_proportion = (Test_Data['Flowers'] == 'Iris-setosa').mean()
print(f"Proportion of Setosa types in Test_Data: {setosa_proportion}")

# 3. Train the K-Nearest Neighbor model (model_1) with 2/3 neighbors and calculate accuracy
features = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
model_1 = KNeighborsClassifier(n_neighbors=2) # You can adjust the number of neighbors
model_1.fit(Train_Data[features], Train_Data['Flowers'])
predictions_model_1 = model_1.predict(Test_Data[features])
accuracy_model_1 = accuracy_score(Test_Data['Flowers'], predictions_model_1)
print(f"Accuracy score of model_1: {accuracy_model_1}")

# 4. Identify the list of indices of misclassified samples from 'model_1'.
misclassified_indices = Test_Data.index[Test_Data['Flowers'] != predictions_model_1].tolist()
print(f"Misclassified sample indices from model_1: {misclassified_indices}")

# 5. Train the Logistic Regression model (model_2) and find its accuracy
# Same features, target and evaluation as model_1. max_iter is raised above
# the library default because the solver may not converge within the default
# iteration budget on unscaled features -- adjust if a warning still appears.
model_2 = LogisticRegression(max_iter=200)
model_2.fit(Train_Data[features], Train_Data['Flowers'])
predictions_model_2 = model_2.predict(Test_Data[features])
accuracy_model_2 = accuracy_score(Test_Data['Flowers'], predictions_model_2)
print(f"Accuracy score of model_2: {accuracy_model_2}")


Number of missing values in Train_Data: 0

Proportion of Setosa types in Test_Data: 0.3333333333333333

Accuracy score of model_1: 1.0

Misclassified sample indices from model_1: []

Accuracy score of model_2: 1.0

Question 3: Import a dataset from http://www.ats.ucla.edu/stat/data/binary.csv. Perform

logistic regression to find out the relation between the variables that affect the admission of a
student to an institute, based on his or her GRE score, GPA obtained and rank of the student.
Also check whether the model is a good fit. Apply regression model techniques to make predictions
on the above dataset.


import pandas as pd
import statsmodels.api as sm

# Load the dataset
# NOTE(review): this is a machine-specific absolute path; point it at your
# local copy of binary.csv before running.
file_path = 'c:/users/zoya/PycharmProjects/pythonProject/assignques3/binary.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset (done before 'const' is added so
# only the original columns are shown)
print(data.head())

# Add a constant term to the predictor; sm.Logit does not add an intercept
# on its own, so it is supplied as an explicit column of ones.
data['const'] = 1

# Define the predictor variables (GRE, GPA, Rank)
X = data[['gre', 'gpa', 'rank', 'const']]

# Define the target variable (admit)
y = data['admit']

# Fit logistic regression model
model = sm.Logit(y, X)
result = model.fit()

# Display the summary of the logistic regression
print(result.summary())

# Check the model fit
# McFadden's pseudo R-squared and the likelihood-ratio test p-value are the
# same quantities shown in the summary header, repeated here on their own.
print("Model Fit:")
print(f"Pseudo R-squared: {result.prsquared:.5f}")
print(f"LLR p-value: {result.llr_pvalue:.4g}")

# Make predictions (in-sample predicted probabilities of admission)
predictions = result.predict(X)

# Display the predicted probabilities
print("Predicted Probabilities:")
print(predictions)


admit gre gpa rank

0 0 380 3.61 3

1 1 660 3.67 3

2 1 800 4.00 1

3 1 640 3.19 4

4 0 520 2.93 4

Optimization terminated successfully.

Current function value: 0.574302

Iterations 6

Logit Regression Results


Dep. Variable: admit No. Observations: 400

Model: Logit Df Residuals: 396

Method: MLE Df Model: 3

Date: Fri, 15 Dec 2023 Pseudo R-squ.: 0.08107

Time: 11:21:30 Log-Likelihood: -229.72

converged: True LL-Null: -249.99

Covariance Type: nonrobust LLR p-value: 8.207e-09


coef std err z P>|z| [0.025 0.975]


gre 0.0023 0.001 2.101 0.036 0.000 0.004

gpa 0.7770 0.327 2.373 0.018 0.135 1.419

rank -0.5600 0.127 -4.405 0.000 -0.809 -0.311

const -3.4495 1.133 -3.045 0.002 -5.670 -1.229


Model Fit:


Predicted Probabilities:

0 0.189553

1 0.317781

2 0.717814

3 0.148949

4 0.097954


395 0.490176

396 0.184989

397 0.186814

398 0.468108

399 0.325045

Length: 400, dtype: float64

Question 4: Demonstrate a decision tree classification model and evaluate the
performance of the classifier on the Iris dataset.


# Import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn import tree

# Load the dataset from the CSV file
iris_df = pd.read_csv('iris.csv')

# Separate features (X) and target variable (y)
X = iris_df.drop('Flowers', axis=1)
y = iris_df['Flowers']

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training set
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Visualize the decision tree
# class_names must follow the order of the fitted classes, so they are taken
# from clf.classes_ rather than from the order in which labels appear in the
# file. Both name arguments are passed as plain lists of str.
plt.figure(figsize=(12, 8))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,  # colour nodes by majority class; purely presentational
)
plt.show()


Accuracy: 1.00

Classification Report:

precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 10

Iris-versicolor 1.00 1.00 1.00 9

Iris-virginica 1.00 1.00 1.00 11

accuracy 1.00 30

macro avg 1.00 1.00 1.00 30

weighted avg 1.00 1.00 1.00 30

Confusion Matrix:

[[10 0 0]

[ 0 9 0]

[ 0 0 11]]

Question 5: Demonstrate any one clustering model and evaluate its performance on the
Iris dataset.


# Import necessary libraries

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score

# Load Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Apply K-Means clustering
# n_init is set explicitly: the library default for this parameter has
# changed between scikit-learn releases, so relying on it can give a warning
# or different clusters depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Visualize the clusters using PCA for dimensionality reduction
# (clustering is done on the original features; PCA is only used to get two
# axes to plot on)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Create a scatter plot of the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='viridis', s=100, alpha=0.8)
plt.title('K-Means Clustering on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Compare the clusters with the actual labels
# The Adjusted Rand Index is invariant to how cluster ids are numbered, so
# the arbitrary K-Means labels can be compared with the species directly.
ari_score = adjusted_rand_score(y, clusters)
print(f"Adjusted Rand Index (ARI): {ari_score:.2f}")


