Assignment 2
Assignment 2
!gdown 1i5c01hhj04J816siI‐6u3ea32vI‐HToy
Downloading...
From: https://drive.google.com/uc?id=1i5c01hhj04J816siI-6u3ea32vI-HToy
To: /content/alloy-confp-train-data.csv
100% 7.22k/7.22k [00:00<00:00, 27.3MB/s]
# Load the training table, shuffle its rows, and write the shuffled copy back.
DATA_PATH = '/content/alloy-confp-train-data.csv'  # plain ASCII hyphens in the file name
data = pd.read_csv(DATA_PATH)
# frac=1 returns every row in random order; the fixed seed keeps reruns
# (and the cross-validation numbers derived from this order) reproducible.
data = data.sample(frac=1, random_state=0)
# index=False: otherwise each rerun writes the row index into the file as an
# extra unnamed column, which the next read_csv loads as if it were data.
data.to_csv(DATA_PATH, index=False)
data.shape
(120, 8)
# Feature matrix: every column whose name contains the literal text 'C.'.
# regex=False is required for that meaning; interpreted as a regular
# expression, 'C.' matches a 'C' followed by ANY character and could select
# unrelated columns.
Xcols = data.columns[data.columns.str.contains('C.', regex=False)]
X = data[Xcols]
X
# 20-bin histograms of the target and of the first three columns of X, one per axis.
# NOTE(review): the axis captions assume X's first three columns are c.Al, c.Co
# and c.Cr in that order - confirm against the CSV header.
# Each entry: (axis, values to bin, x-axis caption, histogram label).
# The target histogram had no label in the original, so it gets None (the default).
hist_panels = (
    (ax1, y.values, 'HV', None),
    (ax2, X.values[:, 0], 'c.Al', 'c.Al'),
    (ax3, X.values[:, 1], 'c.Co', 'c.Co'),
    (ax4, X.values[:, 2], 'c.Cr', 'c.Cr'),
)
for ax, values, caption, hist_label in hist_panels:
    ax.hist(values, bins=20, label=hist_label)
    ax.set_xlabel(caption, fontsize=14)
    # Set once per axis; the original repeated the ax1 call a second time.
    ax.set_ylabel('Frequency', fontsize=14)
Text(0, 0.5, 'Frequency')
# First we will define function to make plots. This will make the code simpler.
if y_cv_train is None:
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(5,4), sharey=True, sharex=True)
else:
fig, (ax1,ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9,4), sharey=True, sharex=True)
ax1.scatter(y_cv_test,y_pred_test)
ax1.text(0.95, 0.26, label, transform=ax1.transAxes, ha='right', fontsize=14)
ax1.text(0.95, 0.18, "RMSE: %.2f"%rmse_test, transform=ax1.transAxes, ha='right', fontsize=14)
ax1.text(0.95, 0.1, "R$^2$: %.2f"%r2_test, transform=ax1.transAxes, ha='right', fontsize=14)
ax1.plot(ylim, ylim, '‐‐k')
ax1.set_xlabel('True y', fontsize=14)
ax1.set_ylabel('Pred y', fontsize=14)
ax1.set_xlim(ylim[0],ylim[1])
ax1.set_ylim(ylim[0],ylim[1])
ax2.scatter(y_cv_train,y_pred_train, c='m')
ax2.text(0.95, 0.26, "Train", transform=ax2.transAxes, ha='right', fontsize=14)
ax2.text(0.95, 0.18, "RMSE: %.2f"%rmse_train, transform=ax2.transAxes, ha='right', fontsize=14
ax2.text(0.95, 0.1, "R2: %.2f"%r2_train, transform=ax2.transAxes, ha='right', fontsize=14)
ax2.plot(ylim, ylim, '‐‐k')
plt.tight_layout()
plt.show()
return None
# Closed-form least squares via the normal equation:
#     w_hat = (X^T X)^(-1) X^T y
# followed by predictions on the same X and a parity plot of the fit.
X_tras_y = np.dot(X.T, y)                      # X^T y
X_tras_X_inv = np.linalg.inv(np.dot(X.T, X))   # (X^T X)^(-1)
w_cap_vec = X_tras_X_inv.dot(X_tras_y)         # fitted weight vector
y_pred_manual = np.dot(X, w_cap_vec)
polt_parity(y, y_pred_manual, label="Train")
# Fit a no-intercept LinearRegression on the same X and y, plot its parity
# plot, and print its coefficients next to the normal-equation weights.
lr = LinearRegression(fit_intercept=False)
model = lr.fit(X, y)
y_pred = model.predict(X)
# Independent snapshot of the fitted estimator, so its coefficients are
# unaffected if `lr` is fitted again later.
lr_model = copy.deepcopy(model)
polt_parity(y, y_pred, label="Train")
for heading, coefficients in (
    ("Sklearn model: ", lr_model.coef_),
    ("Eq. based model: ", w_cap_vec),
):
    print(heading, coefficients)
Sklearn model: [1589.03703891 154.02145017 647.00169133 279.68594241 204.32826373
-241.42532589]
Eq. based model: [1589.03703891 154.02145017 647.00169133 279.68594241 204.32826373
-241.42532589]
# Baseline for comparison: predict the mean of y for every sample and report
# the RMSE of that constant prediction.
y_mean = y.mean()
y_avg_pred = [y_mean] * len(y)
baseline_rmse = np.sqrt(mean_squared_error(y, y_avg_pred))
print("Root mean squared error: %.2f" % baseline_rmse)
# k-fold cross-validation sweep: for each k in 2..11, score every fold and
# record the mean test RMSE for that k.
errors = []    # never filled here; kept because it is converted to a DataFrame further down
rmse_avg = []  # one entry per k: mean test RMSE over that k's folds
for i in range(2, 12):
    kf = KFold(i)
    rmses = []
    for train, test in kf.split(X):
        # `train` / `test` are positional row indices for this fold.
        X_cv_train = X.values[train]
        X_cv_test = X.values[test]
        y_cv_train = y.values[train]
        y_cv_test = y.values[test]

        # NOTE(review): the fit/predict step is absent from this excerpt, which
        # leaves y_pred_test / y_pred_train undefined. Reconstructed with the
        # same no-intercept LinearRegression used for the full-data fit -
        # confirm this matches the intended model.
        cv_model = LinearRegression(fit_intercept=False).fit(X_cv_train, y_cv_train)
        y_pred_test = cv_model.predict(X_cv_test)
        y_pred_train = cv_model.predict(X_cv_train)

        # Computing errors
        rmse_test = np.sqrt(mean_squared_error(y_cv_test, y_pred_test))
        rmse_train = np.sqrt(mean_squared_error(y_cv_train, y_pred_train))
        rmses.append(rmse_test)
    # Average once per k, after all of its folds have been scored.
    rmse_avg.append(sum(rmses) / len(rmses))
rmse_avg
[90.90410023685317,
86.94150026120126,
86.89238900991143,
86.44250692124533,
87.09357006766437,
87.02679087151705,
85.82328460961962,
85.80805780460595,
86.6502684030654,
87.13891194799572]
# Line plot of the average validation RMSE against the number of folds.
errors = pd.DataFrame(errors)
x = range(2, 12)  # fold counts on the x-axis
plt.plot(x, rmse_avg)
plt.xlabel('Number of Folds (k)')
plt.ylabel('Average Validation RMSE')
plt.title('Average Validation RMSE vs. Number of Folds (k)')
plt.grid(True)
plt.show()
# Display the number of columns in X.
X.shape[1]
6
# Hyperparameters and zero-initialised parameters.
# NOTE(review): presumably the setup for an iterative (gradient-descent style)
# fit that follows this excerpt - confirm.
learning_rate = 0.01
num_iterations = 1000
bias = 0
weights = np.zeros(X.shape[1])  # one weight per column of X
weights