Assignment 2

This document analyzes and models an alloy-composition dataset in Python. It imports libraries for data analysis and machine learning, loads and explores the alloy composition and hardness (HV) data, and examines distributions and correlations. It then fits several linear regression models, including a manual normal-equation calculation, a Scikit-Learn implementation, and gradient-descent optimization, and evaluates their performance on the training and test sets.

import pandas as pd              # To work with data tables
import matplotlib.pyplot as plt  # To visualize data
import numpy as np
import copy

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
!gdown 1i5c01hhj04J816siI-6u3ea32vI-HToy
Downloading...
From: https://drive.google.com/uc?id=1i5c01hhj04J816siI-6u3ea32vI-HToy
To: /content/alloy-confp-train-data.csv
100% 7.22k/7.22k [00:00<00:00, 27.3MB/s]

data = pd.read_csv('/content/alloy-confp-train-data.csv')

# Shuffle the rows and overwrite the file with the shuffled order
# (index=False avoids writing a spurious index column back to the CSV)
data = data.sample(frac=1)
data.to_csv('/content/alloy-confp-train-data.csv', index=False)

data.shape
(120, 8)

# Select the composition columns ('C.al', 'C.co', ...); regex=False treats '.' literally
Xcols = data.columns[data.columns.str.contains('C.', regex=False)]
X = data[Xcols]
X

         C.al      C.co      C.cr      C.cu      C.fe      C.ni
102  0.074074  0.000000  0.185185  0.185185  0.185185  0.370370
22   0.250000  0.166667  0.166667  0.083333  0.166667  0.166667
105  0.062500  0.000000  0.312500  0.000000  0.468750  0.156250
106  0.142857  0.285714  0.000000  0.000000  0.285714  0.285714
117  0.264706  0.147059  0.147059  0.147059  0.147059  0.147059
..        ...       ...       ...       ...       ...       ...
60   0.428571  0.142857  0.071429  0.071429  0.071429  0.214286
3    0.208333  0.000000  0.208333  0.208333  0.208333  0.166667
79   0.264706  0.000000  0.147059  0.147059  0.147059  0.294118
34   0.000000  0.250000  0.000000  0.250000  0.250000  0.250000
88   0.166667  0.166667  0.333333  0.000000  0.166667  0.166667

120 rows × 6 columns
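Each row is a set of atomic fractions, and the rows shown above sum to 1. A quick sanity check of that invariant (a sketch, not part of the original notebook):

# Sanity check: compositions should sum to 1 within floating-point tolerance
assert np.allclose(X.sum(axis=1), 1.0), "compositions do not sum to 1"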

# Quick look at the distribution of one composition column
X['C.al'].plot(kind='hist', bins=20, title='C.al')
plt.gca().spines[['top', 'right']].set_visible(False)

y = data['HV']

fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=1, ncols=4, figsize=(12, 3.5))

ax1.hist(y.values, bins=20)
ax2.hist(X.values[:, 0], bins=20, label='C.al')
ax3.hist(X.values[:, 1], bins=20, label='C.co')
ax4.hist(X.values[:, 2], bins=20, label='C.cr')

ax1.set_xlabel('HV', fontsize=14)
ax2.set_xlabel('C.al', fontsize=14)
ax3.set_xlabel('C.co', fontsize=14)
ax4.set_xlabel('C.cr', fontsize=14)

ax1.set_ylabel('Frequency', fontsize=14)
ax2.set_ylabel('Frequency', fontsize=14)
ax3.set_ylabel('Frequency', fontsize=14)
ax4.set_ylabel('Frequency', fontsize=14)
# First we define a function to make parity plots. This keeps the later code simpler.

def plot_parity(y_cv_test, y_pred_test, y_cv_train=None, y_pred_train=None, label=None, ylim=[50, 900]):
    """
    Make parity (predicted vs. true) plots, optionally with a second panel for training data.
    """
    rmse_test = np.sqrt(mean_squared_error(y_cv_test, y_pred_test))
    r2_test = r2_score(y_cv_test, y_pred_test)

    if y_cv_train is None:
        fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(5, 4), sharey=True, sharex=True)
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4), sharey=True, sharex=True)

    ax1.scatter(y_cv_test, y_pred_test)
    ax1.text(0.95, 0.26, label, transform=ax1.transAxes, ha='right', fontsize=14)
    ax1.text(0.95, 0.18, "RMSE: %.2f" % rmse_test, transform=ax1.transAxes, ha='right', fontsize=14)
    ax1.text(0.95, 0.1, "R$^2$: %.2f" % r2_test, transform=ax1.transAxes, ha='right', fontsize=14)
    ax1.plot(ylim, ylim, '--k')
    ax1.set_xlabel('True y', fontsize=14)
    ax1.set_ylabel('Pred y', fontsize=14)
    ax1.set_xlim(ylim[0], ylim[1])
    ax1.set_ylim(ylim[0], ylim[1])

    if y_cv_train is not None:
        rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))
        r2_train = r2_score(y_cv_train, y_pred_train)

        ax2.scatter(y_cv_train, y_pred_train, c='m')
        ax2.text(0.95, 0.26, "Train", transform=ax2.transAxes, ha='right', fontsize=14)
        ax2.text(0.95, 0.18, "RMSE: %.2f" % rmse_train, transform=ax2.transAxes, ha='right', fontsize=14)
        ax2.text(0.95, 0.1, "R$^2$: %.2f" % r2_train, transform=ax2.transAxes, ha='right', fontsize=14)
        ax2.plot(ylim, ylim, '--k')
        ax2.set_xlabel('True y', fontsize=14)
        ax2.set_xlim(ylim[0], ylim[1])
        ax2.set_ylim(ylim[0], ylim[1])

    plt.tight_layout()
    plt.show()

    return None
# Ordinary least squares via the normal equation: w = (X^T X)^{-1} X^T y
X_tras_X_inv = np.linalg.inv(np.dot(X.T, X))
X_tras_y = np.dot(X.T, y)
w_cap_vec = np.dot(X_tras_X_inv, X_tras_y)
y_pred_manual = np.dot(X, w_cap_vec)

plot_parity(y, y_pred_manual, label="Train")
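Explicitly inverting X^T X can be numerically unstable when the matrix is ill-conditioned. An equivalent, more stable alternative (a sketch, not part of the original assignment) is NumPy's least-squares solver:

# Numerically stable least-squares solve; should match w_cap_vec up to floating-point error
w_lstsq, *_ = np.linalg.lstsq(X.values, y.values, rcond=None)
print(w_lstsq)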

lr = LinearRegression(fit_intercept=False)
model = lr.fit(X, y)
lr_model = copy.deepcopy(model)

y_pred = model.predict(X)
plot_parity(y, y_pred, label="Train")
print("Sklearn model: ", lr_model.coef_)
print("Eq. based model: ", w_cap_vec)
Sklearn model: [1589.03703891 154.02145017 647.00169133 279.68594241 204.32826373
-241.42532589]
Eq. based model: [1589.03703891 154.02145017 647.00169133 279.68594241 204.32826373
-241.42532589]
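The two coefficient vectors agree, as expected: with fit_intercept=False, Scikit-Learn minimizes the same least-squares objective the normal equation solves. A quick numerical check (a sketch, not from the original notebook):

# Confirm the closed-form and Scikit-Learn coefficients agree
print(np.allclose(lr_model.coef_, w_cap_vec))  # expected: True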

# Baseline model: always predict the mean of y
y_avg_pred = [y.mean()] * len(y)
print("Root mean squared error: %.2f" % np.sqrt(mean_squared_error(y, y_avg_pred)))

# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y, y_avg_pred))
Root mean squared error: 186.35
Coefficient of determination: 0.00
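An R^2 of exactly 0 is expected here: R^2 = 1 - SS_res/SS_tot, and predicting the mean makes the residual and total sums of squares identical. The baseline RMSE is simply the population standard deviation of HV (a sketch check, not part of the original):

# Baseline RMSE equals the population standard deviation of y
print("%.2f" % y.std(ddof=0))  # expected: 186.35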

# Cross-validation: sweep the number of folds k from 2 to 11
rmse_avg = []
for i in range(2, 12):
    kf = KFold(i)
    rmses = []
    for idx, (train, test) in enumerate(kf.split(X)):
        X_cv_train = X.values[train]
        X_cv_test = X.values[test]

        y_cv_train = y.values[train]
        y_cv_test = y.values[test]

        # Model fit and prediction
        model = lr.fit(X_cv_train, y_cv_train)
        y_pred_test = model.predict(X_cv_test)
        y_pred_train = model.predict(X_cv_train)

        # Computing errors
        rmse_test = np.sqrt(mean_squared_error(y_cv_test, y_pred_test))
        rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))

        r2_test = r2_score(y_cv_test, y_pred_test)
        r2_train = r2_score(y_cv_train, y_pred_train)

        # Parity plots (uncomment to inspect each fold)
        # plot_parity(y_cv_test, y_pred_test, y_cv_train, y_pred_train)
        # print("Root mean squared error: %.2f" % rmse_test)
        # print("Coefficient of determination: %.2f" % r2_test)

        rmses.append(rmse_test)

    rmse_avg.append(sum(rmses) / len(rmses))

rmse_avg
[90.90410023685317,
86.94150026120126,
86.89238900991143,
86.44250692124533,
87.09357006766437,
87.02679087151705,
85.82328460961962,
85.80805780460595,
86.6502684030654,
87.13891194799572]
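The average validation RMSE stays within roughly 86-91 HV across all k, so for this dataset the error estimate is not very sensitive to the number of folds.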

x = range(2, 12)

plt.plot(x, rmse_avg)
plt.title('Average Validation RMSE vs. Number of Folds (k)')
plt.xlabel('Number of Folds (k)')
plt.ylabel('Average Validation RMSE')
plt.grid(True)
plt.show()
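The same sweep can be written more compactly with Scikit-Learn's cross_val_score (a sketch using the same lr estimator; not part of the original notebook):

from sklearn.model_selection import cross_val_score

# Scikit-Learn reports negative MSE so that higher scores are better
for k in range(2, 12):
    scores = cross_val_score(lr, X, y, cv=KFold(k), scoring='neg_mean_squared_error')
    print(k, np.sqrt(-scores).mean())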

X.shape[1]
6

# Gradient-descent setup: iteration count, step size, and zero-initialized parameters
num_iterations = 1000
learning_rate = 0.01

weights = np.zeros(X.shape[1])
bias = 0

def cost_function(weights, bias, X, y):
    """Root-mean-squared error of the current linear model."""
    predictions = np.dot(X, weights) + bias
    cost = np.sqrt(np.mean((predictions - y) ** 2))
    return cost

def gradient_descent(weights, bias, X, y, learning_rate, num_iterations):
    """Batch gradient descent on the squared-error loss."""
    for i in range(num_iterations):
        predictions = np.dot(X, weights) + bias
        # The 1/n factor of the mean-squared-error gradient is folded into the learning rate
        gradient_weights = np.dot(X.T, (predictions - y))
        gradient_bias = np.sum(predictions - y)
        weights -= learning_rate * gradient_weights
        bias -= learning_rate * gradient_bias
        if i % 100 == 0:
            print(f"Iteration {i}: Cost = {cost_function(weights, bias, X, y)}")
    return weights, bias

weights, bias = gradient_descent(weights, bias, X, y, learning_rate, num_iterations)

print(f"Optimal weights: {weights}")
print(f"Optimal bias: {bias}")
Iteration 0: Cost = 255.67893485794872
Iteration 100: Cost = 87.65877256028017
Iteration 200: Cost = 82.03876320532915
Iteration 300: Cost = 81.60425558281177
Iteration 400: Cost = 81.55097669962204
Iteration 500: Cost = 81.54245710489937
Iteration 600: Cost = 81.54084599370569
Iteration 700: Cost = 81.5405029810298
Iteration 800: Cost = 81.54042383588254
Iteration 900: Cost = 81.54040461622849
Optimal weights: [1212.94760702 -221.99136401 270.9281074 -96.50510823 -171.89813133
-617.39672497]
Optimal bias: 376.08439070091987
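The gradient-descent weights look different from the closed-form ones only because this model carries a bias term: each composition row sums to 1, so adding the bias to every weight leaves the predictions unchanged. A quick consistency check (a sketch, not part of the original assignment):

# Because rows of X sum to 1, the two parameterizations are equivalent:
# closed-form weights ≈ gradient-descent weights + bias
print(weights + bias)  # expected: close to w_cap_vec, i.e. [1589.04, 154.02, ...]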

weights

array([1212.94760702, -221.99136401,  270.9281074 ,  -96.50510823,
       -171.89813133, -617.39672497])
