
LSTM

May 6, 2024

1 LSTM From Scratch in Python


By Cristian Leo

[1]: import numpy as np
     import pandas as pd
     import matplotlib.pyplot as plt

     # Custom classes (built from scratch)
     from src.model import WeightInitializer
     from src.trainer import PlotManager, EarlyStopping
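
The `WeightInitializer`, `PlotManager`, and `EarlyStopping` helpers live in the author's `src` package and are not reproduced in this notebook. As a rough sketch of the interface (my assumption, inferred only from how the class is called below, not the author's actual code), a minimal `WeightInitializer` supporting Xavier initialization could look like this:

    # Hypothetical stand-in for src.model.WeightInitializer (assumed interface):
    # the LSTM below only calls .initialize(shape), so that is all we implement.
    import numpy as np

    class WeightInitializer:
        def __init__(self, method='xavier'):
            self.method = method

        def initialize(self, shape):
            fan_out, fan_in = shape
            if self.method == 'xavier':
                # Xavier/Glorot uniform: limit = sqrt(6 / (fan_in + fan_out))
                limit = np.sqrt(6.0 / (fan_in + fan_out))
                return np.random.uniform(-limit, limit, size=shape)
            # Fallback: small random normal weights
            return np.random.randn(*shape) * 0.01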

[2]: class LSTM:
         """
         Long Short-Term Memory (LSTM) network.

         Parameters:
         - input_size: int, dimensionality of input space
         - hidden_size: int, number of LSTM units
         - output_size: int, dimensionality of output space
         - init_method: str, weight initialization method (default: 'xavier')
         """
         def __init__(self, input_size, hidden_size, output_size, init_method='xavier'):
             self.input_size = input_size
             self.hidden_size = hidden_size
             self.output_size = output_size
             self.weight_initializer = WeightInitializer(method=init_method)

             # Initialize weights
             self.wf = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
             self.wi = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
             self.wo = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
             self.wc = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))

             # Initialize biases
             self.bf = np.zeros((hidden_size, 1))
             self.bi = np.zeros((hidden_size, 1))
             self.bo = np.zeros((hidden_size, 1))
             self.bc = np.zeros((hidden_size, 1))

             # Initialize output layer weights and biases
             self.why = self.weight_initializer.initialize((output_size, hidden_size))
             self.by = np.zeros((output_size, 1))

         @staticmethod
         def sigmoid(z):
             """
             Sigmoid activation function.

             Parameters:
             - z: np.ndarray, input to the activation function

             Returns:
             - np.ndarray, output of the activation function
             """
             return 1 / (1 + np.exp(-z))

         @staticmethod
         def dsigmoid(y):
             """
             Derivative of the sigmoid activation function.

             Parameters:
             - y: np.ndarray, output of the sigmoid activation function

             Returns:
             - np.ndarray, derivative of the sigmoid function
             """
             return y * (1 - y)

         @staticmethod
         def dtanh(y):
             """
             Derivative of the hyperbolic tangent activation function.

             Parameters:
             - y: np.ndarray, output of the hyperbolic tangent activation function

             Returns:
             - np.ndarray, derivative of the hyperbolic tangent function
             """
             return 1 - y * y

         def forward(self, x):
             """
             Forward pass through the LSTM network.

             Parameters:
             - x: np.ndarray, input to the network

             Returns:
             - np.ndarray, output of the network
             - list, caches containing intermediate values for backpropagation
             """
             caches = []
             h_prev = np.zeros((self.hidden_size, 1))
             c_prev = np.zeros((self.hidden_size, 1))
             h = h_prev
             c = c_prev

             for t in range(x.shape[0]):
                 x_t = x[t].reshape(-1, 1)
                 combined = np.vstack((h_prev, x_t))

                 f = self.sigmoid(np.dot(self.wf, combined) + self.bf)
                 i = self.sigmoid(np.dot(self.wi, combined) + self.bi)
                 o = self.sigmoid(np.dot(self.wo, combined) + self.bo)
                 c_ = np.tanh(np.dot(self.wc, combined) + self.bc)

                 c = f * c_prev + i * c_
                 h = o * np.tanh(c)

                 cache = (h_prev, c_prev, f, i, o, c_, x_t, combined, c, h)
                 caches.append(cache)

                 h_prev, c_prev = h, c

             y = np.dot(self.why, h) + self.by
             return y, caches

         def backward(self, dy, caches, clip_value=1.0):
             """
             Backward pass through the LSTM network.

             Parameters:
             - dy: np.ndarray, gradient of the loss with respect to the output
             - caches: list, caches from the forward pass
             - clip_value: float, value to clip gradients to (default: 1.0)

             Returns:
             - tuple, gradients of the loss with respect to the parameters
             """
             dWf, dWi, dWo, dWc = [np.zeros_like(w) for w in (self.wf, self.wi, self.wo, self.wc)]
             dbf, dbi, dbo, dbc = [np.zeros_like(b) for b in (self.bf, self.bi, self.bo, self.bc)]
             dWhy = np.zeros_like(self.why)
             dby = np.zeros_like(self.by)

             # Ensure dy is reshaped to match output size
             dy = dy.reshape(self.output_size, -1)
             dh_next = np.zeros((self.hidden_size, 1))  # shape must match hidden_size
             dc_next = np.zeros_like(dh_next)

             for cache in reversed(caches):
                 h_prev, c_prev, f, i, o, c_, x_t, combined, c, h = cache

                 # Add gradient from next step to current output gradient
                 dh = np.dot(self.why.T, dy) + dh_next
                 dc = dc_next + (dh * o * self.dtanh(np.tanh(c)))

                 df = dc * c_prev * self.dsigmoid(f)
                 di = dc * c_ * self.dsigmoid(i)
                 do = dh * self.dtanh(np.tanh(c))
                 dc_ = dc * i * self.dtanh(c_)

                 dcombined_f = np.dot(self.wf.T, df)
                 dcombined_i = np.dot(self.wi.T, di)
                 dcombined_o = np.dot(self.wo.T, do)
                 dcombined_c = np.dot(self.wc.T, dc_)

                 dcombined = dcombined_f + dcombined_i + dcombined_o + dcombined_c

                 dh_next = dcombined[:self.hidden_size]
                 dc_next = f * dc

                 dWf += np.dot(df, combined.T)
                 dWi += np.dot(di, combined.T)
                 dWo += np.dot(do, combined.T)
                 dWc += np.dot(dc_, combined.T)

                 dbf += df.sum(axis=1, keepdims=True)
                 dbi += di.sum(axis=1, keepdims=True)
                 dbo += do.sum(axis=1, keepdims=True)
                 dbc += dc_.sum(axis=1, keepdims=True)

                 dWhy += np.dot(dy, h.T)
                 dby += dy

             gradients = (dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc, dWhy, dby)

             # Gradient clipping
             for i in range(len(gradients)):
                 np.clip(gradients[i], -clip_value, clip_value, out=gradients[i])

             return gradients

         def update_params(self, grads, learning_rate):
             """
             Update the parameters of the network using the gradients.

             Parameters:
             - grads: tuple, gradients of the loss with respect to the parameters
             - learning_rate: float, learning rate
             """
             dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc, dWhy, dby = grads

             self.wf -= learning_rate * dWf
             self.wi -= learning_rate * dWi
             self.wo -= learning_rate * dWo
             self.wc -= learning_rate * dWc

             self.bf -= learning_rate * dbf
             self.bi -= learning_rate * dbi
             self.bo -= learning_rate * dbo
             self.bc -= learning_rate * dbc

             self.why -= learning_rate * dWhy
             self.by -= learning_rate * dby
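
For reference, the gate computations in `forward` are the standard LSTM update equations. In the notation of the code, `combined` is the stacked vector $[h_{t-1}; x_t]$, `c_` is the candidate cell state $\tilde{c}_t$, and the prediction is a linear read-out of the final hidden state:

    \begin{aligned}
    f_t &= \sigma(W_f\,[h_{t-1}; x_t] + b_f) \\
    i_t &= \sigma(W_i\,[h_{t-1}; x_t] + b_i) \\
    o_t &= \sigma(W_o\,[h_{t-1}; x_t] + b_o) \\
    \tilde{c}_t &= \tanh(W_c\,[h_{t-1}; x_t] + b_c) \\
    c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t \\
    h_t &= o_t \odot \tanh(c_t) \\
    y &= W_{hy}\,h_T + b_y
    \end{aligned}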

[3]: class LSTMTrainer:
         """
         Trainer for the LSTM network.

         Parameters:
         - model: LSTM, the LSTM network to train
         - learning_rate: float, learning rate for the optimizer
         - patience: int, number of epochs to wait before early stopping
         - verbose: bool, whether to print training information
         - delta: float, minimum change in validation loss to qualify as an improvement
         """
         def __init__(self, model, learning_rate=0.01, patience=7, verbose=True, delta=0):
             self.model = model
             self.learning_rate = learning_rate
             self.train_losses = []
             self.val_losses = []
             self.early_stopping = EarlyStopping(patience, verbose, delta)

         def train(self, X_train, y_train, X_val=None, y_val=None, epochs=10, batch_size=1, clip_value=1.0):
             """
             Train the LSTM network.

             Parameters:
             - X_train: np.ndarray, training data
             - y_train: np.ndarray, training labels
             - X_val: np.ndarray, validation data
             - y_val: np.ndarray, validation labels
             - epochs: int, number of training epochs
             - batch_size: int, size of mini-batches
             - clip_value: float, value to clip gradients to
             """
             for epoch in range(epochs):
                 epoch_losses = []

                 for i in range(0, len(X_train), batch_size):
                     batch_X = X_train[i:i + batch_size]
                     batch_y = y_train[i:i + batch_size]
                     losses = []

                     for x, y_true in zip(batch_X, batch_y):
                         y_pred, caches = self.model.forward(x)
                         loss = self.compute_loss(y_pred, y_true.reshape(-1, 1))
                         losses.append(loss)

                         # Backpropagation to get gradients
                         dy = y_pred - y_true.reshape(-1, 1)
                         grads = self.model.backward(dy, caches, clip_value=clip_value)
                         self.model.update_params(grads, self.learning_rate)

                     batch_loss = np.mean(losses)
                     epoch_losses.append(batch_loss)

                 avg_epoch_loss = np.mean(epoch_losses)
                 self.train_losses.append(avg_epoch_loss)

                 if X_val is not None and y_val is not None:
                     val_loss = self.validate(X_val, y_val)
                     self.val_losses.append(val_loss)

                     if epoch % 10 == 0:
                         print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_epoch_loss:.5f}, Val Loss: {val_loss:.5f}')

                     # Check early stopping condition
                     self.early_stopping(val_loss)
                     if self.early_stopping.early_stop:
                         print("Early stopping")
                         break
                 else:
                     print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_epoch_loss:.5f}')

         def compute_loss(self, y_pred, y_true):
             """
             Compute mean squared error loss.
             """
             return np.mean((y_pred - y_true) ** 2)

         def validate(self, X_val, y_val):
             """
             Validate the model on a separate set of data.
             """
             val_losses = []
             for x, y_true in zip(X_val, y_val):
                 y_pred, _ = self.model.forward(x)
                 loss = self.compute_loss(y_pred, y_true.reshape(-1, 1))
                 val_losses.append(loss)
             return np.mean(val_losses)
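
`EarlyStopping` also comes from the author's `src.trainer` module and is not shown in the notebook. Judging only from how the trainer uses it (`self.early_stopping(val_loss)` and the `early_stop` flag), a minimal sketch might look like the following; the author's actual implementation may differ:

    # Hypothetical stand-in for src.trainer.EarlyStopping (assumed interface):
    # stop when the validation loss has not improved by more than `delta`
    # for `patience` consecutive epochs.
    class EarlyStopping:
        def __init__(self, patience=7, verbose=True, delta=0):
            self.patience = patience
            self.verbose = verbose
            self.delta = delta
            self.best_loss = None
            self.counter = 0
            self.early_stop = False

        def __call__(self, val_loss):
            if self.best_loss is None or val_loss < self.best_loss - self.delta:
                # Improvement: record the new best loss and reset the counter
                self.best_loss = val_loss
                self.counter = 0
            else:
                self.counter += 1
                if self.verbose:
                    print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
                if self.counter >= self.patience:
                    self.early_stop = True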

[11]: class TimeSeriesDataset:
          """
          Dataset class for time series data.

          Parameters:
          - start_date: str, start date for data retrieval
          - end_date: str, end date for data retrieval
          - look_back: int, number of previous time steps to include in each sample
          - train_size: float, proportion of data to use for training
          """
          def __init__(self, start_date, end_date, look_back=1, train_size=0.67):
              self.start_date = start_date
              self.end_date = end_date
              self.look_back = look_back
              self.train_size = train_size

          def load_data(self):
              """
              Load stock data.

              Returns:
              - np.ndarray, training data
              - np.ndarray, testing data
              """
              df = pd.read_csv('data/google.csv')
              df = df[(df['Date'] >= self.start_date) & (df['Date'] <= self.end_date)]
              df = df.sort_values('Date')
              df = df[['Close']].astype(float)  # Use closing price
              df = self.MinMaxScaler(df.values)  # Convert DataFrame to numpy array
              train_size = int(len(df) * self.train_size)
              train, test = df[0:train_size, :], df[train_size:len(df), :]
              return train, test

          def MinMaxScaler(self, data):
              """
              Min-max scaling of the data.

              Parameters:
              - data: np.ndarray, input data
              """
              numerator = data - np.min(data, 0)
              denominator = np.max(data, 0) - np.min(data, 0)
              return numerator / (denominator + 1e-7)

          def create_dataset(self, dataset):
              """
              Create the dataset for time series prediction.

              Parameters:
              - dataset: np.ndarray, input data

              Returns:
              - np.ndarray, input data
              - np.ndarray, output data
              """
              dataX, dataY = [], []
              for i in range(len(dataset) - self.look_back):
                  a = dataset[i:(i + self.look_back), 0]
                  dataX.append(a)
                  dataY.append(dataset[i + self.look_back, 0])
              return np.array(dataX), np.array(dataY)

          def get_train_test(self):
              """
              Get the training and testing data.

              Returns:
              - np.ndarray, training input
              - np.ndarray, training output
              - np.ndarray, testing input
              - np.ndarray, testing output
              """
              train, test = self.load_data()
              trainX, trainY = self.create_dataset(train)
              testX, testY = self.create_dataset(test)
              return trainX, trainY, testX, testY
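
To make the `look_back` windowing concrete, here is a tiny worked example with toy numbers (my illustration, not data from the notebook): with `look_back=1`, `create_dataset` pairs each value with its immediate successor.

    # Toy illustration of create_dataset with look_back=1 (hypothetical data)
    toy = np.array([[0.1], [0.2], [0.3], [0.4]])  # 4 time steps, 1 feature
    ds = TimeSeriesDataset('2005-01-01', '2020-12-31', look_back=1)
    X, y = ds.create_dataset(toy)
    print(X)  # [[0.1] [0.2] [0.3]] -> inputs
    print(y)  # [0.2 0.3 0.4]       -> next-step targets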

[12]: # Instantiate the dataset
      dataset = TimeSeriesDataset('2005-01-01', '2020-12-31', train_size=0.7, look_back=1)
      trainX, trainY, testX, testY = dataset.get_train_test()

      # Combine train and test data
      combined = np.concatenate((trainY, testY))

      # Plot the data
      plt.figure(figsize=(14, 5))
      plt.plot(combined, label='Google Stock Price', linewidth=2, color='dodgerblue')
      plt.title('Google Stock Price', fontsize=20)
      plt.xlabel('Time', fontsize=16)
      plt.ylabel('Normalized Stock Price', fontsize=16)
      plt.grid(True)
      plt.legend(fontsize=14)
      plt.xticks(fontsize=12)
      plt.yticks(fontsize=12)
      plt.show()

[13]: # Reshape input to be [samples, time steps, features]
      trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
      testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 1))

      look_back = 1      # Number of previous time steps to include in each sample
      hidden_size = 256  # Number of LSTM units
      output_size = 1    # Dimensionality of the output space

      lstm = LSTM(input_size=1, hidden_size=hidden_size, output_size=output_size)

      # Create and train the LSTM using LSTMTrainer
      trainer = LSTMTrainer(lstm, learning_rate=1e-3, patience=50, verbose=True, delta=0.001)
      trainer.train(trainX, trainY, testX, testY, epochs=1000, batch_size=32)

Epoch 1/1000 - Loss: 0.24629, Val Loss: 0.53735
Epoch 11/1000 - Loss: 0.07889, Val Loss: 0.11416
Epoch 21/1000 - Loss: 0.06242, Val Loss: 0.05693
Epoch 31/1000 - Loss: 0.05286, Val Loss: 0.03841
Epoch 41/1000 - Loss: 0.04575, Val Loss: 0.02845
Epoch 51/1000 - Loss: 0.04046, Val Loss: 0.02222
Epoch 61/1000 - Loss: 0.03655, Val Loss: 0.01831
Epoch 71/1000 - Loss: 0.03364, Val Loss: 0.01598
Epoch 81/1000 - Loss: 0.03149, Val Loss: 0.01472
Epoch 91/1000 - Loss: 0.02989, Val Loss: 0.01420
Epoch 101/1000 - Loss: 0.02870, Val Loss: 0.01417
Epoch 111/1000 - Loss: 0.02782, Val Loss: 0.01444
Epoch 121/1000 - Loss: 0.02717, Val Loss: 0.01490
Epoch 131/1000 - Loss: 0.02668, Val Loss: 0.01546
Early stopping

[15]: plot_manager = PlotManager()

      # Plot the training and validation losses recorded during training
      plot_manager.plot_losses(trainer.train_losses, trainer.val_losses)

      # Display the plots
      plot_manager.show_plots()
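
As a follow-on sketch (my addition, not part of the original notebook), one way to inspect the fit is to run the trained model over the test windows and compare the one-step-ahead predictions with the normalized targets:

    # Hypothetical follow-up: one-step-ahead predictions on the test set
    preds = np.array([lstm.forward(x)[0].item() for x in testX])

    plt.figure(figsize=(14, 5))
    plt.plot(testY, label='Actual (normalized)', color='dodgerblue')
    plt.plot(preds, label='Predicted (normalized)', color='darkorange')
    plt.title('LSTM one-step-ahead predictions on the test set')
    plt.xlabel('Time')
    plt.ylabel('Normalized Stock Price')
    plt.legend()
    plt.show()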
