Learning From Text Using RNN

import torchtext.datasets
train_iter, test_iter = torchtext.datasets.IMDB(
    root='/home/jovyan/public/datasets/IMDB/', 
    split=('train', 'test')
)
import itertools
import pandas as pd
data = list(iter(train_iter))
len(data)
25000
df = pd.DataFrame(data, columns=['sentiment', 'review'])
df_neg = df[df['sentiment'] == 1]
df_pos = df[df['sentiment'] == 2]
df_pos.head(3)
sentiment review
12500 2 Zentropa has much in common with The Third Man...
12501 2 Zentropa is the most original movie I've seen ...
12502 2 Lars Von Trier is never backward in trying out...
df_neg.head(3)
sentiment review
0 1 I rented I AM CURIOUS-YELLOW from my video sto...
1 1 "I Am Curious: Yellow" is a risible and preten...
2 1 If only to avoid making this type of film in t...

Let’s put everything into a function.

def load_dataframe(iterator):
    data = list(iter(iterator))
    df = pd.DataFrame(data, columns=['sentiment', 'review'])
    # IMDB labels are 1 (negative) / 2 (positive); shift them to 0 / 1
    df['sentiment'] = df['sentiment'] - 1
    return df
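
The same helper works for the held-out test split, e.g.:

df_test = load_dataframe(test_iter)
len(df_test)   # the IMDB test split also contains 25,000 reviews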

1 Tokenized sequences

import torchtext.data

tokenizer = torchtext.data.get_tokenizer('basic_english')
from tqdm.notebook import tqdm

def iterate_tokens(df):
    for review in tqdm(df['review']):
        yield tokenizer(review)
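
As a quick sanity check, the basic_english tokenizer lowercases the text and splits punctuation into separate tokens, so something like this is expected:

tokenizer('This movie was GREAT!')
['this', 'movie', 'was', 'great', '!']
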
from torchtext.vocab import build_vocab_from_iterator

df = load_dataframe(train_iter)

vocab = build_vocab_from_iterator(
    iterate_tokens(df),
    min_freq=5,
    specials=['<unk>', '<s>', '<eos>'])

# out-of-vocabulary tokens fall back to index 0, i.e. '<unk>'
vocab.set_default_index(0)

len(vocab)
30124
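
Tokens that occur at least five times get their own index; anything else falls back to the default index 0, i.e. '<unk>'. For instance:

vocab.lookup_indices(['the', 'movie', 'zzxqjk'])   # the made-up last token maps to 0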

2 Dataset Preparation

import torch
from torch.utils.data import TensorDataset, DataLoader
sequences = [
    torch.tensor(vocab.lookup_indices(tokenizer(review)), dtype=torch.int64)
    for review in df['review']
]

len(sequences)
25000
from torch.nn.utils.rnn import pad_sequence

# pad every review to the length of the longest one, then truncate to 250 tokens
padded_sequences = pad_sequence(sequences, batch_first=True)[:, :250]
padded_sequences.shape
torch.Size([25000, 250])
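
Note that pad_sequence fills the short reviews with the value 0, which here happens to coincide with the '<unk>' index; writing the padding value explicitly would be equivalent:

padded_sequences = pad_sequence(
    sequences, batch_first=True, padding_value=vocab['<unk>']
)[:, :250]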
sentiments = torch.tensor(df['sentiment'], dtype=torch.int64)
sentiments.shape
torch.Size([25000])
dataset = TensorDataset(padded_sequences, sentiments)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)
(inputs, targets) = next(iter(dataloader))
inputs.shape, targets.shape
(torch.Size([32, 250]), torch.Size([32]))

Putting it together as a function.

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import random_split

def get_dataloaders(df, vocab, max_length=250, batch_size=32):
    sequences = [
        torch.tensor(
            vocab.lookup_indices(tokenizer(review)),
            dtype=torch.int64
        ) for review in df['review']
    ]
    padded_sequences = pad_sequence(sequences, batch_first=True)[:, :max_length]
    sentiments = torch.tensor(df['sentiment'], dtype=torch.int64)
    dataset = TensorDataset(padded_sequences, sentiments)
    (train_dataset, val_dataset) = random_split(dataset, (0.7, 0.3))
    
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    # shuffling the validation set is harmless but unnecessary (Lightning warns about it below)
    val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)

    return (train_dataloader, val_dataloader)
df_train = load_dataframe(train_iter)

train_dataloader, val_dataloader = get_dataloaders(
    df_train,
    vocab,
    max_length=250,
    batch_size=32)

3 End-to-end sequence classifier

import torch
import torch.nn as nn
from lightning.pytorch import LightningModule
from torchmetrics import Accuracy

class MySequenceClassifier(LightningModule):
    def __init__(self, vocab_size, dim_emb, dim_state):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim_emb)
        self.rnn = nn.RNN(input_size=dim_emb,
                          hidden_size=dim_state,
                          num_layers=1,
                          batch_first=True)
        self.output = nn.Linear(dim_state, 2)
        
        # will be monitoring accuracy
        self.accuracy = Accuracy(task='multiclass', num_classes=2)
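
The methods below are attached to the class with an add_method decorator (presumably defined earlier in these notes). A minimal version consistent with how it is used here would be:

def add_method(cls):
    # attach the decorated function to `cls` as a method,
    # so the class can be built up one cell at a time
    def decorator(func):
        setattr(cls, func.__name__, func)
        return func
    return decorator
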
@add_method(MySequenceClassifier)
def forward(self, sequence_batch):
    emb = self.embedding(sequence_batch)
    _, state = self.rnn(emb)
    # state: (num_layers, batch, dim_state)
    output = self.output(state[0])
    return output
model = MySequenceClassifier(vocab_size=len(vocab), dim_emb=16, dim_state=32)
batch = next(iter(train_dataloader))
model(batch[0]).shape
torch.Size([32, 2])
@add_method(MySequenceClassifier)
def loss(self, outputs, targets):
    return nn.functional.cross_entropy(outputs, targets)
(inputs, targets) = next(iter(train_dataloader))
outputs = model(inputs)
model.loss(outputs, targets)
tensor(0.6879, grad_fn=<NllLossBackward0>)
@add_method(MySequenceClassifier)
def training_step(self, batch, batch_index):
    inputs, targets = batch
    outputs = self.forward(inputs)
    loss = self.loss(outputs, targets)
    self.accuracy(outputs, targets)
    self.log('loss', loss, prog_bar=True)
    self.log('acc', self.accuracy, prog_bar=True)
    return loss
import torch.optim

@add_method(MySequenceClassifier)
def configure_optimizers(self):
    return torch.optim.Adam(self.parameters())
@add_method(MySequenceClassifier)
def validation_step(self, batch, batch_index):
    inputs, targets = batch
    outputs = self.forward(inputs)
    self.accuracy(outputs, targets)
    self.log('val_acc', self.accuracy, prog_bar=True)
@add_method(MySequenceClassifier)
def on_train_epoch_end(self):
    self.log('train_acc_epoch', self.accuracy, prog_bar=True)
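
Note that the same self.accuracy metric is updated in training_step, validation_step and on_train_epoch_end, which is likely what triggers the torchmetrics warning in the training output below. A common variation (not used here) keeps separate metric objects, roughly:

# sketch only, in __init__:
# self.train_acc = Accuracy(task='multiclass', num_classes=2)
# self.val_acc = Accuracy(task='multiclass', num_classes=2)
# then update and log self.train_acc in training_step and self.val_acc in validation_step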

4 Training

from lightning.pytorch import Trainer

trainer = Trainer(max_epochs=10)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
model = MySequenceClassifier(vocab_size=len(vocab),
                             dim_emb=32,
                             dim_state=64)

trainer.fit(model,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader)

  | Name      | Type               | Params
-------------------------------------------------
0 | embedding | Embedding          | 963 K 
1 | rnn       | RNN                | 6.3 K 
2 | output    | Linear             | 130   
3 | accuracy  | MulticlassAccuracy | 0     
-------------------------------------------------
970 K     Trainable params
0         Non-trainable params
970 K     Total params
3.881     Total estimated model params size (MB)
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/torchmetrics/utilities/prints.py:43: UserWarning: The ``compute`` method of metric MulticlassAccuracy was called before the ``update`` method which may lead to errors, as metric states have not yet been updated.
  warnings.warn(*args, **kwargs)  # noqa: B028
`Trainer.fit` stopped: `max_epochs=10` reached.
metrics = pd.read_csv('./lightning_logs/version_1/metrics.csv')
val_acc = metrics['val_acc'].dropna().reset_index(drop=True).to_frame()
val_acc.index.name = 'epochs'
val_acc
val_acc
epochs
0 0.50160
1 0.51276
2 0.51644
3 0.52504
4 0.53228
5 0.54504
6 0.55036
7 0.57036
8 0.57136
9 0.58700
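
For a quick visual check, the per-epoch validation accuracy can be plotted straight from the DataFrame (assuming matplotlib is available):

import matplotlib.pyplot as plt

val_acc.plot(marker='o', legend=False)
plt.xlabel('epoch')
plt.ylabel('validation accuracy')
plt.show()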