Learning From Text Using LSTM

1 Get the dataset

```python
import torchtext.datasets

train_iter, test_iter = torchtext.datasets.IMDB(
    root='/home/jovyan/public/datasets/IMDB/',
    split=('train', 'test'))
```
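The IMDB dataset contains 25,000 labeled movie reviews in each of the train and test splits.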
```python
import pandas as pd

def load_dataframe(iterator):
    data = list(iter(iterator))
    df = pd.DataFrame(data, columns=['sentiment', 'review'])
    # IMDB labels are 1 (negative) and 2 (positive); shift them to 0/1
    df['sentiment'] = df['sentiment'] - 1
    return df
```
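A quick peek at one raw item shows why the function above shifts the labels (a sketch; torchtext's IMDB pipe yields `(label, text)` pairs with label 1 = negative, 2 = positive):

```python
# Datapipes restart on each call to iter(), so this does not consume train_iter
label, text = next(iter(train_iter))
print(label, text[:80])  # a label in {1, 2} and the start of one review
```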
```python
import torchtext.data
from torchtext.vocab import build_vocab_from_iterator
from tqdm.notebook import tqdm

tokenizer = torchtext.data.get_tokenizer('basic_english')

def iterate_tokens(df):
    for review in tqdm(df['review']):
        yield tokenizer(review)
```
```python
df = load_dataframe(train_iter)

vocab = build_vocab_from_iterator(
    iterate_tokens(df),
    min_freq=5,
    specials=['<unk>', '<s>', '<eos>'])
vocab.set_default_index(0)

len(vocab)
```

```
30124
```
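A minimal sketch of what the vocabulary does (the sentence is made up): any token seen fewer than `min_freq=5` times, or never seen at all, maps to index 0, i.e. `'<unk>'`, because of `set_default_index(0)`.

```python
sample = "this movie was watchable xyzzyqux"  # hypothetical review fragment
tokens = tokenizer(sample)                    # lowercased word tokens
print(vocab.lookup_indices(tokens))           # rare/unseen tokens become 0
```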
2 Dataloaders
```python
import torch
from torch.utils.data import (
    TensorDataset,
    DataLoader,
    random_split,
)
from torch.nn.utils.rnn import pad_sequence

sequences = [
    torch.tensor(vocab.lookup_indices(tokenizer(review)), dtype=torch.int64)
    for review in df['review']
]
# pad to a common length, then truncate to the first 250 tokens
padded_sequences = pad_sequence(sequences, batch_first=True)[:, :250]
sentiments = torch.tensor(df['sentiment'], dtype=torch.int64)

dataset = TensorDataset(padded_sequences, sentiments)
(train_dataset, val_dataset) = random_split(dataset, (0.7, 0.3))

batch_size = 32
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)  # shuffling here triggers a Lightning warning during fit
```
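A shape check on one batch (a sketch; the reported sizes assume at least one review reaches 250 tokens, which holds for IMDB):

```python
inputs, targets = next(iter(train_dataloader))
print(inputs.shape)   # expected: torch.Size([32, 250]) -- padded token ids
print(targets.shape)  # expected: torch.Size([32])      -- 0/1 labels
```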
3 End-to-end sequence classifier
```python
import torch
import torch.nn as nn
from lightning.pytorch import LightningModule
from torchmetrics import Accuracy

class MySequenceClassifier(LightningModule):
    def __init__(self, vocab_size, dim_emb, dim_state):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim_emb)
        self.lstm = nn.LSTM(input_size=dim_emb,
                            hidden_size=dim_state,
                            num_layers=1,
                            batch_first=True)
        self.output = nn.Linear(dim_state, 2)
        # we will be monitoring accuracy
        self.accuracy = Accuracy(task='multiclass', num_classes=2)
```
```python
import torch.optim

#
# the rest of the methods
#
class MySequenceClassifier(MySequenceClassifier):
    def forward(self, seq_batch):
        emb = self.embedding(seq_batch)
        _, (state, _) = self.lstm(emb)
        # state: (num_layers, batch, dim_state); take the last layer
        output = self.output(state[-1])
        return output

    def loss(self, outputs, targets):
        return nn.functional.cross_entropy(outputs, targets)

    def training_step(self, batch, batch_index):
        inputs, targets = batch
        outputs = self.forward(inputs)
        loss = self.loss(outputs, targets)
        self.accuracy(outputs, targets)
        self.log('acc', self.accuracy, prog_bar=True)
        self.log('loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def validation_step(self, batch, batch_index):
        inputs, targets = batch
        outputs = self.forward(inputs)
        self.accuracy(outputs, targets)
        self.log('val_acc', self.accuracy, prog_bar=True)
```
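Before training, a smoke test confirms the shapes line up (a sketch with a random batch of token ids):

```python
m = MySequenceClassifier(vocab_size=len(vocab), dim_emb=32, dim_state=64)
dummy = torch.randint(0, len(vocab), (4, 250))  # 4 fake reviews, 250 tokens each
print(m(dummy).shape)                           # torch.Size([4, 2]) -- class logits
```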
4 Training
```python
from lightning.pytorch import Trainer
from lightning.pytorch.loggers import CSVLogger

logger = CSVLogger('./lightning_logs/', 'lstm')

trainer = Trainer(max_epochs=10, logger=logger)
```

```
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
```

```python
model = MySequenceClassifier(vocab_size=len(vocab),
                             dim_emb=32,
                             dim_state=64)

trainer.fit(model,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader)
```
```
Missing logger folder: ./lightning_logs/lstm

  | Name      | Type               | Params
-------------------------------------------------
0 | embedding | Embedding          | 963 K
1 | lstm      | LSTM               | 25.1 K
2 | output    | Linear             | 130
3 | accuracy  | MulticlassAccuracy | 0
-------------------------------------------------
989 K     Trainable params
0         Non-trainable params
989 K     Total params
3.957     Total estimated model params size (MB)

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument to `num_workers=31` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument to `num_workers=31` in the `DataLoader` to improve performance.

`Trainer.fit` stopped: `max_epochs=10` reached.
```
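The parameter counts in the summary above can be checked by hand (a sketch; PyTorch's single-layer LSTM has four gates, each with input weights, recurrent weights, and two bias vectors):

```python
vocab_size, dim_emb, dim_state = 30124, 32, 64
emb  = vocab_size * dim_emb                                     # 963,968 (~963 K)
lstm = 4 * (dim_state * (dim_emb + dim_state) + 2 * dim_state)  # 25,088  (~25.1 K)
head = dim_state * 2 + 2                                        # 130
print(emb + lstm + head)  # 989,186 (~989 K, i.e. ~3.957 MB at 4 bytes per param)
```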
```python
metrics = pd.read_csv('./lightning_logs/lstm/version_0/metrics.csv')
val_acc = metrics['val_acc'].dropna().reset_index(drop=True).to_frame()
val_acc.index.name = 'epochs'
val_acc.columns = ['LSTM_acc']
val_acc
```
epochs | LSTM_acc
---|---
0 | 0.50440
1 | 0.52992
2 | 0.54548
3 | 0.57316
4 | 0.61564
5 | 0.71092
6 | 0.76944
7 | 0.81680
8 | 0.84912
9 | 0.86832
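To use the trained model on new text, the preprocessing must mirror the training pipeline; a sketch (the review is made up, and after the label shift 1 means positive):

```python
review = "a surprisingly touching film with a terrific cast"  # hypothetical input
ids = torch.tensor(vocab.lookup_indices(tokenizer(review)),
                   dtype=torch.int64).unsqueeze(0)  # batch of one sequence
model.eval()
with torch.no_grad():
    pred = model(ids).argmax(dim=-1).item()
print('positive' if pred == 1 else 'negative')
```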
5 Compare with RNN
```python
metrics_rnn = pd.read_csv('./lightning_logs/version_1/metrics.csv')
val_acc_rnn = metrics_rnn['val_acc'].dropna().reset_index(drop=True).to_frame()
val_acc_rnn.index.name = 'epochs'
val_acc_rnn.columns = ['SimpleRNN_acc']
acc = val_acc_rnn.merge(val_acc, left_index=True, right_index=True)
acc.plot.line();
acc
```
epochs | SimpleRNN_acc | LSTM_acc
---|---|---
0 | 0.50160 | 0.50440
1 | 0.51276 | 0.52992
2 | 0.51644 | 0.54548
3 | 0.52504 | 0.57316
4 | 0.53228 | 0.61564
5 | 0.54504 | 0.71092
6 | 0.55036 | 0.76944
7 | 0.57036 | 0.81680
8 | 0.57136 | 0.84912
9 | 0.58700 | 0.86832
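The gap is expected: the LSTM's gating preserves gradient flow across the 250-token reviews, while the simple RNN's plain recurrence suffers from vanishing gradients and improves far more slowly.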