import matplotlib.pyplot as pyplot
import pandas as pd
import numpy as np
Evaluation
1 Overfitting
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
1.1 Noisy data
To illustrate the danger of freely wielding computational power to learn from data, let’s consider the following linear but noisy dataset.
\[ y = 2x + 1 + \mathrm{noise} \]
torch.manual_seed(0)
xs = torch.linspace(0, 1, 20)
target = 2 * xs + 1 + 0.3 * torch.randn(20)
pyplot.plot(xs, target, '--o');
1.2 Learning from noisy data
class MLP(nn.Module):
    def __init__(self, hidden_dim, num_hidden_layers):
        super().__init__()
        self.layer1 = nn.Linear(1, hidden_dim)
        self.layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.Sigmoid(),
            )
            for _ in range(num_hidden_layers)
        ])
        self.layer_last = nn.Linear(hidden_dim, 1)

    def forward(
        self,
        x,  # (batch,)
    ):
        x = x[:, None]          # (batch, 1)
        x = self.layer1(x)      # (batch, hidden)
        for layer in self.layers:
            x = layer(x)
        x = self.layer_last(x)  # (batch, 1)
        x = x[:, 0]             # (batch,)
        return x
simple_model = MLP(2, 1)

optimizer = optim.Adam(simple_model.parameters())
num_epochs = 5000
from tqdm.notebook import trange
with trange(num_epochs) as progress:
    for epoch in progress:
        loss = F.mse_loss(simple_model(xs), target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss = loss.detach().numpy()
        progress.display('loss:{:.2f}'.format(loss))
with torch.no_grad():
    output = simple_model(xs)
    loss = F.mse_loss(output, target)

loss
tensor(0.0642)
pyplot.plot(xs, target, 'o', xs, output);
The simple_model has achieved a good fit of the noisy data. Numerically, the loss value is about 0.06.
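To see what the network has actually learned between the training points, we can also evaluate it on a denser grid of inputs. The sketch below is illustrative and not part of the original experiment; the grid size and plotting style are arbitrary choices.

# Illustrative sketch: evaluate the trained simple_model on a denser grid of inputs.
xs_dense = torch.linspace(0, 1, 200)
with torch.no_grad():
    output_dense = simple_model(xs_dense)

pyplot.plot(xs, target, 'o')           # noisy training samples
pyplot.plot(xs_dense, output_dense)    # learned function on the dense grid
pyplot.legend(['training data', 'simple_model']);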
1.3 Large models
large_model = MLP(hidden_dim=1024, num_hidden_layers=8)

optimizer = optim.Adam(large_model.parameters())
num_epochs = 5000
from tqdm.notebook import trange
with trange(num_epochs) as progress:
    for epoch in progress:
        loss = F.mse_loss(large_model(xs), target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss = loss.detach().numpy()
        progress.display('loss:{:.2f}'.format(loss))
with torch.no_grad():
    output = large_model(xs)
    loss = F.mse_loss(output, target)

pyplot.plot(xs, target, 'o', xs, output)
loss
tensor(0.0016)
The large_model achieved a “better” loss value of 0.0016, but objectively we would say that large_model has achieved a worse fit: it no longer follows the trend in the data, but rather follows the noise in the data. How do we justify that large_model is worse than simple_model?
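One informal check is to compare each model’s predictions against the underlying trend \( y = 2x + 1 \) rather than against the noisy targets; we happen to know the trend here because we generated the data ourselves. The sketch below is only illustrative and not part of the original code; in practice the true trend is unknown, which is exactly why we turn to validation next.

# Illustrative only: measure how far each model is from the true trend 2x + 1.
xs_dense = torch.linspace(0, 1, 200)
true_trend = 2 * xs_dense + 1

with torch.no_grad():
    for name, model in [('simple_model', simple_model), ('large_model', large_model)]:
        deviation = F.mse_loss(model(xs_dense), true_trend).item()
        print(f'{name}: MSE against the true trend = {deviation:.4f}')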
2 Validation
Validation is a technique to assess the quality of a model by holding out a portion of the available samples for evaluation; this held-out portion is known as the validation dataset. In most cases, the validation dataset is not used for training.
\[ D = D_\mathrm{train} \cup D_\mathrm{val} \]
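In this notebook the training and validation sets are generated separately for clarity. With a single pre-existing dataset, a common approach is to shuffle the sample indices and split them. The sketch below, with an arbitrary 80/20 split, is one way to do that; the helper name and ratio are assumptions, not part of the original code.

# Minimal sketch of a random train/validation split (assumed 80/20 ratio).
def train_val_split(xs, ys, val_fraction=0.2):
    n = len(xs)
    perm = torch.randperm(n)                     # shuffle the sample indices
    n_val = int(n * val_fraction)
    val_idx, train_idx = perm[:n_val], perm[n_val:]
    return (xs[train_idx], ys[train_idx]), (xs[val_idx], ys[val_idx])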
torch.manual_seed(0)
xs_train = torch.linspace(0, 1, 20)
target_train = 2 * xs_train + 1 + 0.3 * torch.randn(20)

xs_val = torch.linspace(1, 1.8, 15)
target_val = 2 * xs_val + 1 + 0.3 * torch.randn(15)

pyplot.plot(xs_train, target_train, '--o', xs_val, target_val, 'o');
2.1 Validation loss
The validation loss is the loss function evaluated on the validation dataset rather than the training dataset:
\[ L_\mathrm{val}(\theta, D_\mathrm{val}) \]
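As a minimal sketch of this definition, the helper below (a hypothetical function, not part of the original code) evaluates a model’s MSE loss on a held-out dataset:

# Hypothetical helper: the loss of `model` evaluated on a held-out dataset.
def validation_loss(model, xs_val, target_val):
    with torch.no_grad():                 # no gradients needed for evaluation
        return F.mse_loss(model(xs_val), target_val).item()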
models = {
    'simple_model': simple_model,
    'large_model': large_model,
}

losses = {
    'val_loss': [],
    'train_loss': [],
}

for (name, model) in models.items():
    with torch.no_grad():
        output = model(xs_val)
        losses['val_loss'].append(F.mse_loss(output, target_val).item())

        output = model(xs_train)
        losses['train_loss'].append(F.mse_loss(output, target_train).item())

df = pd.DataFrame(losses, index=models.keys())
df
|              | val_loss | train_loss |
|--------------|----------|------------|
| simple_model | 0.655414 | 0.064218   |
| large_model  | 8.085220 | 0.001590   |
2.2 Detecting overfitting with validation
We are going to focus on the large model, but this time, we will perform validation at the end of each epoch.
large_model = MLP(hidden_dim=1024, num_hidden_layers=8)

optimizer = optim.Adam(large_model.parameters())
num_epochs = 5000

losses = {
    'val_loss': [],
    'train_loss': [],
}

with trange(num_epochs) as progress:
    for epoch in progress:
        train_loss = F.mse_loss(large_model(xs_train), target_train)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # validation
        with torch.no_grad():
            val_loss = F.mse_loss(large_model(xs_val), target_val)

        train_loss = train_loss.detach().item()
        val_loss = val_loss.item()

        losses['train_loss'].append(train_loss)
        losses['val_loss'].append(val_loss)

        progress.display('train_loss=%.4f, val_loss=%.4f' % (train_loss, val_loss))

df = pd.DataFrame(losses)
df.head()
|   | val_loss | train_loss |
|---|----------|------------|
| 0 | 0.883441 | 5.854383   |
| 1 | 0.977545 | 0.491185   |
| 2 | 1.484757 | 3.012334   |
| 3 | 0.676571 | 3.106008   |
| 4 | 0.099790 | 1.453448   |
pyplot.plot(df.index, np.log(df.train_loss))
pyplot.plot(df.index, np.log(df.val_loss))
pyplot.legend(['train loss', 'validation loss']);
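The log-loss curves make the overfitting visible: the training loss keeps decreasing while the validation loss eventually stops improving. A common way to act on this signal is to keep track of the epoch with the lowest validation loss. The sketch below is a minimal example of that idea, not part of the original notebook, and only inspects the recorded losses after training.

# Minimal sketch: find the epoch where the recorded validation loss was smallest.
best_epoch = int(df.val_loss.idxmin())
print('best epoch:', best_epoch,
      'val_loss:', df.val_loss[best_epoch],
      'train_loss:', df.train_loss[best_epoch])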