import torchtext.datasets
train_iter, test_iter = torchtext.datasets.IMDB(
    root='/home/jovyan/public/datasets/IMDB/',
    split=('train', 'test')
)
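Each iterator yields (label, text) pairs; in this version of torchtext, the label is 1 for a negative review and 2 for a positive one.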
import itertools
import pandas as pd
data = list(iter(train_iter))
len(data)
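(The IMDB train split holds 25,000 labeled reviews, 12,500 per class, so this should print 25000.)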
df = pd.DataFrame(data, columns=['sentiment', 'review'])
df_neg = df[df['sentiment'] == 1]
df_pos = df[df['sentiment'] == 2]
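The outputs below appear to come from peeking at the first rows of each slice, presumably with df_pos.head(3) and df_neg.head(3) (the exact calls are not shown in the source):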
       sentiment                                              review
12500          2  Zentropa has much in common with The Third Man...
12501          2  Zentropa is the most original movie I've seen ...
12502          2  Lars Von Trier is never backward in trying out...

   sentiment                                              review
0          1  I rented I AM CURIOUS-YELLOW from my video sto...
1          1  "I Am Curious: Yellow" is a risible and preten...
2          1  If only to avoid making this type of film in t...
Let’s put everything into a function. The returned DataFrame has two columns:
sentiment: 0 is negative, and 1 is positive
review: the review text as a string
def load_dataframe(iterator):
    data = list(iter(iterator))
    df = pd.DataFrame(data, columns=['sentiment', 'review'])
    # shift labels from {1, 2} to {0, 1}
    df['sentiment'] = df['sentiment'] - 1
    return df
Tokenized sequences
import torchtext.data
tokenizer = torchtext.data.get_tokenizer('basic_english')
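The basic_english tokenizer lowercases the text and splits punctuation into separate tokens. For example (expected behavior, not an output from the source):

tokenizer("This movie was GREAT!")
# ['this', 'movie', 'was', 'great', '!']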
from tqdm.notebook import tqdm

def iterate_tokens(df):
    for review in tqdm(df['review']):
        yield tokenizer(review)
from torchtext.vocab import build_vocab_from_iterator
df = load_dataframe(train_iter)
vocab = build_vocab_from_iterator(
    iterate_tokens(df),
    min_freq=5,
    specials=['<unk>', '<s>', '<eos>'])
vocab.set_default_index(0)
len(vocab)
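Tokens seen fewer than five times are excluded, and set_default_index(0) makes any out-of-vocabulary token map to index 0, the <unk> special. The exact size is not shown here, but the embedding layer in the model summary below (963 K parameters with dim_emb=32) implies a vocabulary of roughly 30,000 entries. A quick sanity check (hypothetical):

vocab['<unk>']        # 0
vocab['qwertyzzz']    # also 0, via the default index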
Dataset Preparation
import torch
from torch.utils.data import TensorDataset, DataLoader
sequences = [
    torch.tensor(vocab.lookup_indices(tokenizer(review)), dtype=torch.int64)
    for review in df['review']
]
len(sequences)
from torch.nn.utils.rnn import pad_sequence
padded_sequences = pad_sequence(sequences, batch_first=True)[:, :250]
padded_sequences.shape
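pad_sequence right-pads every sequence with zeros up to the length of the longest review, and the slice then truncates everything to the first 250 tokens, so the result should have shape (25000, 250). A toy illustration (not from the source):

pad_sequence([torch.tensor([1, 2, 3]), torch.tensor([4])], batch_first=True)
# tensor([[1, 2, 3],
#         [4, 0, 0]])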
sentiments = torch.tensor(df['sentiment'], dtype=torch.int64)
sentiments.shape
dataset = TensorDataset(padded_sequences, sentiments)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)
(inputs, targets) = next(iter(dataloader))
inputs.shape, targets.shape
(torch.Size([32, 250]), torch.Size([32]))
Putting it together as a function.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import random_split

def get_dataloaders(df, vocab, max_length=250, batch_size=32):
    sequences = [
        torch.tensor(
            vocab.lookup_indices(tokenizer(review)),
            dtype=torch.int64
        ) for review in df['review']
    ]
    padded_sequences = pad_sequence(sequences, batch_first=True)[:, :max_length]
    sentiments = torch.tensor(df['sentiment'], dtype=torch.int64)
    dataset = TensorDataset(padded_sequences, sentiments)
    (train_dataset, val_dataset) = random_split(dataset, (0.7, 0.3))
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    # no shuffling for validation: the order doesn't matter for evaluation
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
    return (train_dataloader, val_dataloader)
df_train = load_dataframe(train_iter)
train_dataloader, val_dataloader = get_dataloaders(
    df_train,
    vocab,
    max_length=250,
    batch_size=32)
End-to-end sequence classifier
import torch
import torch.nn as nn
from lightning.pytorch import LightningModule
from torchmetrics import Accuracy

class MySequenceClassifier(LightningModule):
    def __init__(self, vocab_size, dim_emb, dim_state):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim_emb)
        self.rnn = nn.RNN(input_size=dim_emb,
                          hidden_size=dim_state,
                          num_layers=1,
                          batch_first=True)
        self.output = nn.Linear(dim_state, 2)
        # will be monitoring accuracy
        self.accuracy = Accuracy(task='multiclass', num_classes=2)
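The remaining methods are attached to the class one at a time with the @add_method decorator, whose definition does not appear in this section. A minimal sketch of such a decorator (an assumption about how it is defined elsewhere):

def add_method(cls):
    def decorator(func):
        # attach the function to the class under its own name
        setattr(cls, func.__name__, func)
        return func
    return decorator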
@add_method(MySequenceClassifier)
def forward(self, sequence_batch):
    emb = self.embedding(sequence_batch)
    _, state = self.rnn(emb)
    # state: (num_layers, batch, dim_state)
    output = self.output(state[0])
    return output
model = MySequenceClassifier(vocab_size=len(vocab), dim_emb=16, dim_state=32)
batch = next(iter(train_dataloader))
model(batch[0]).shape
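With batches of 32 sequences, this should be torch.Size([32, 2]): one logit per class for each review.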
@add_method(MySequenceClassifier)
def loss(self, outputs, targets):
    # cross_entropy expects raw logits, so forward applies no softmax
    return nn.functional.cross_entropy(outputs, targets)

(inputs, targets) = next(iter(train_dataloader))
outputs = model(inputs)
model.loss(outputs, targets)
tensor(0.6879, grad_fn=<NllLossBackward0>)
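This is close to ln 2 ≈ 0.693, exactly what we expect from an untrained two-class classifier guessing at chance level.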
@add_method(MySequenceClassifier)
def training_step(self, batch, batch_index):
    inputs, targets = batch
    outputs = self.forward(inputs)
    loss = self.loss(outputs, targets)
    self.accuracy(outputs, targets)
    self.log('loss', loss, prog_bar=True)
    self.log('acc', self.accuracy, prog_bar=True)
    return loss
import torch.optim

@add_method(MySequenceClassifier)
def configure_optimizers(self):
    return torch.optim.Adam(self.parameters())
@add_method(MySequenceClassifier)
def validation_step(self, batch, batch_index):
    inputs, targets = batch
    outputs = self.forward(inputs)
    self.accuracy(outputs, targets)
    self.log('val_acc', self.accuracy, prog_bar=True)

@add_method(MySequenceClassifier)
def on_train_epoch_end(self):
    self.log('train_acc_epoch', self.accuracy, prog_bar=True)
Training
from lightning.pytorch import Trainer
trainer = Trainer(max_epochs=10)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
model = MySequenceClassifier(vocab_size=len(vocab),
                             dim_emb=32,
                             dim_state=64)
trainer.fit(model,
            train_dataloaders=train_dataloader,
            val_dataloaders=val_dataloader)
| Name | Type | Params
-------------------------------------------------
0 | embedding | Embedding | 963 K
1 | rnn | RNN | 6.3 K
2 | output | Linear | 130
3 | accuracy | MulticlassAccuracy | 0
-------------------------------------------------
970 K Trainable params
0 Non-trainable params
970 K Total params
3.881 Total estimated model params size (MB)
`Trainer.fit` stopped: `max_epochs=10` reached.
metrics = pd.read_csv('./lightning_logs/version_1/metrics.csv')
val_acc = metrics['val_acc'].dropna().reset_index(drop=True).to_frame()
val_acc.index.name = 'epochs'
val_acc
        val_acc
epochs
0       0.50160
1       0.51276
2       0.51644
3       0.52504
4       0.53228
5       0.54504
6       0.55036
7       0.57036
8       0.57136
9       0.58700
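Validation accuracy climbs steadily from chance level (0.50) to about 0.59 over ten epochs. To visualize the trend (a hypothetical follow-up, assuming matplotlib is installed):

import matplotlib.pyplot as plt
val_acc.plot(marker='o', legend=False)
plt.xlabel('epoch')
plt.ylabel('validation accuracy')
plt.show()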