import numpy as np
import matplotlib.pyplot as pl
Multi-layer Perceptron
1 Linear Separability of Binary Classification
Consider the training data:
- Inputs: \(X = \{x_i\in\mathbb{R}^d\}\)
- Labels: \(Y = \{y_i\in\{0,1\}\}\)
We say that \((X, Y)\) is linearly separable if there exists a hyperplane in \(\mathbb{R}^d\), defined by
- \(w\in\mathbb{R}^d\)
- \(b\in\mathbb{R}\)
such that:
\[\forall i,\quad y_i = 1 \iff w^T x_i + b > 0\]
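For example, in \(\mathbb{R}^2\) the hyperplane is just a line, and checking a candidate \((w, b)\) amounts to testing the sign of \(w^T x_i + b\) for every point. A minimal sketch (the helper separates and the toy points are purely illustrative, not part of the dataset below):

def separates(w, b, X, Y):
    # class 1 iff the point lies on the positive side of the hyperplane
    pred = (X @ w + b > 0).astype(int)
    return bool(np.all(pred == Y))

X = np.array([[1.0, 1.0], [-1.0, -1.0]])   # two toy points
Y = np.array([1, 0])
separates(np.array([1.0, 1.0]), 0.0, X, Y)  # expected: True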
#
# Linearly separable dataset
#
np.random.seed(1)
ang = -45 * np.pi / 180
M = np.array([
    [1.0, 0.0],
    [0.0, 2.0]
]) @ np.array([
    [np.cos(ang), np.sin(ang)],
    [-np.sin(ang), np.cos(ang)]
])
L = 3
x0 = np.random.randn(1000, 2) @ M + np.array([-L, L])
x1 = np.random.randn(1000, 2) @ M + np.array([L, -L])

figure = pl.figure(figsize=(6,6))
ax = figure.add_subplot(1,1,1)
ax.scatter(x0[:, 0], x0[:, 1], s=1)
ax.scatter(x1[:, 0], x1[:, 1], s=1)
#
# Separation
#
ax.plot([-8, 8], [-8, 8], '--')
ax.set_aspect('equal');
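The dashed line is \(x_2 = x_1\), i.e. the hyperplane with \(w = (-1, 1)\) and \(b = 0\). As a quick check that it really separates the two clusters (a sketch reusing x0 and x1 from above; a handful of noisy points could in principle cross the line):

#
# Fraction of each cluster on its expected side of x2 = x1
#
w = np.array([-1.0, 1.0])
print(np.mean(x0 @ w > 0), np.mean(x1 @ w < 0))  # both should be close to 1.0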
#
# Linearly inseparable dataset
#
import sklearn.datasets
x_donuts, y_donuts = sklearn.datasets.make_circles(1000, noise=0.05, factor=0.5)
x0 = x_donuts[y_donuts == 0]
x1 = x_donuts[y_donuts == 1]

figure = pl.figure(figsize=(6,10))
ax = pl.gca()
ax.scatter(x0[:, 0], x0[:, 1], s=1)
ax.scatter(x1[:, 0], x1[:, 1], s=1)
ax.set_aspect('equal')
ax.set_title('Linearly Inseparable Donuts');
2 Kernel Methods
Suppose \(\{(x_i, y_i)\}\) is not linearly separable. Can we find some transformation \(\phi\) such that \(\{(\phi(x_i), y_i)\}\) becomes linearly separable?
Yes.
The function \(\phi\) is usually called the feature map; strictly speaking, the kernel is the induced inner product \(k(x, x') = \langle\phi(x), \phi(x')\rangle\), but here we will simply refer to \(\phi\) as the kernel.
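For the donut data below, one simple choice appends the distance from the origin as a third coordinate:
\[\phi(x) = \left(x_1,\ x_2,\ \sqrt{x_1^2 + x_2^2}\right)\]
The inner and outer rings then sit at different heights along the new axis, so a horizontal plane can separate them.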
def kernel(x):
    r = np.linalg.norm(x, axis=-1)
    return np.concatenate([x, r[:, None]], axis=-1)

kernel(x_donuts)[:5].round(3)
array([[ 0.034, -0.575, 0.576],
[-0.078, 0.552, 0.558],
[-0.819, -0.713, 1.086],
[ 0.09 , -0.518, 0.526],
[-0.452, 0.139, 0.473]])
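The third column is the radius \(\lVert x\rVert\): roughly 0.5 for points on the inner circle and roughly 1 for points on the outer circle. This is the coordinate along which the two classes become separable.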
#
# Linear separation in higher dimensions
#
x_new = kernel(x_donuts)
x0_new = x_new[y_donuts == 0]
x1_new = x_new[y_donuts == 1]

fig = pl.figure()
ax = fig.add_subplot(projection='3d')
ax.view_init(15, 25)
ax.scatter(x0_new[:,0], x0_new[:,1], x0_new[:,2], c='red')
ax.scatter(x1_new[:,0], x1_new[:,1], x1_new[:,2], c='blue')
ax.set_title('Linearly separable after kernel transformation')

xx, yy = np.meshgrid(np.linspace(-1, 1, 100), np.linspace(-1, 1, 100))
z = 0 * xx + 0.8
ax.plot_surface(xx, yy, z, alpha=0.2, color='red');
3 Learning Nonlinear Transformation
import sklearn.datasets
import pandas as pd
pl.set_cmap('tab20c')
(x_moon, y_moon) = sklearn.datasets.make_moons(1000, noise=0.1)
pl.scatter(x_moon[:, 0], x_moon[:, 1], c=y_moon, s=1);
We can incorporate the non-linear kernel transformation as part of the classifier model.
The model now has two layers:
- Transform input to a higher dimension:
\[z = \sigma_1(xW_1 + b_1)\] where \(W_1\in\mathbb{R}^{2\times k}\), \(b_1\in\mathbb{R}^k\), and \(\sigma_1\) is any non-linear activation function.
- Perform classification:
\[ p = \mathrm{sigmoid}(zW_2 + b_2) \] where \(W_2\in\mathbb{R}^{k}\) and \(b_2\in\mathbb{R}\). (Both steps are sketched in numpy below.)
In general, the overall model architecture is given as:
\[ x:\mathrm{Input} \underbrace{\longrightarrow}_\mathrm{hidden} z:k \underbrace{\longrightarrow}_\mathrm{classify} p:c \]
This is known as the multi-layer perceptron (MLP), where
- \(k\) is the hidden dimension
- \(c\) is the number of classes
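Before moving to PyTorch, here is the same two-step computation written directly in numpy as a sketch; the weights are random and tanh is just one possible choice of \(\sigma_1\):

def mlp_forward(x, W1, b1, W2, b2):
    z = np.tanh(x @ W1 + b1)              # hidden layer: R^2 -> R^k
    p = 1 / (1 + np.exp(-(z @ W2 + b2)))  # classify: R^k -> probability in (0, 1)
    return p

k = 10
rng = np.random.default_rng(0)
W1, b1 = rng.normal(size=(2, k)), np.zeros(k)
W2, b2 = rng.normal(size=k), 0.0
mlp_forward(np.array([[0.5, -0.2]]), W1, b1, W2, b2)  # one probability per input row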
4 MLP in PyTorch
import torch
import torch.nn as nn
4.1 The model
class MLPBinaryClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.hidden(x)
        x = nn.functional.softmax(x, dim=1)
        x = self.output(x)
        x = torch.sigmoid(x)
        return x
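A quick shape check on an untrained model (dummy inputs; the actual values will vary):

model = MLPBinaryClassifier(2, 10)
model(torch.randn(5, 2)).shape  # expected: torch.Size([5, 1]), one probability per input row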
4.2 Training
x_input = torch.tensor(x_moon, dtype=torch.float32)
y_true = torch.tensor(y_moon, dtype=torch.float32).reshape(-1, 1)
model = MLPBinaryClassifier(2, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
loss = nn.functional.binary_cross_entropy

epochs = 10000
for epoch in range(epochs):
    optimizer.zero_grad()
    l = loss(model(x_input), y_true)
    l.backward()
    optimizer.step()
    if epoch % (epochs//10) == 0:
        print(epoch, l.detach().numpy())
print(epoch, l.detach().numpy())
0 0.7031396
1000 0.00036938215
2000 0.00010792634
3000 4.6887973e-05
4000 2.3616216e-05
5000 1.2775891e-05
6000 7.174313e-06
7000 4.121147e-06
8000 2.390107e-06
9000 1.4174445e-06
9999 8.0992186e-07
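With the loss this small, the model should classify essentially every training point correctly. A quick check using the tensors defined above:

with torch.no_grad():
    pred = (model(x_input) > 0.5).float()
    print((pred == y_true).float().mean())  # training accuracy, expected to be close to 1.0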
4.3 Visualizing classification boundary
x_lim = np.linspace(-1.5, 2.5, 100)
y_lim = np.linspace(-1, 1.5, 100)
xx, yy = np.meshgrid(x_lim, y_lim)
x_input = np.array([xx.ravel(), yy.ravel()]).T
x_input = torch.tensor(x_input, dtype=torch.float32)
x_input.shape
torch.Size([10000, 2])
model.eval()
z = model(x_input)
z = z.reshape(100, 100).detach().numpy()
z.shape
(100, 100)
pl.set_cmap('tab20c')
pl.scatter(x_moon[:, 0], x_moon[:, 1], c=y_moon, s=1)
pl.contour(xx, yy, z, levels=1);
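With levels=1, matplotlib chooses the contour level automatically. To draw the decision boundary itself, we can request the 0.5 probability level explicitly (an alternative to the call above):

pl.contour(xx, yy, z, levels=[0.5], colors='black');  # probability-0.5 decision boundary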
5 PyTorch Sequential Module
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Softmax(dim=1),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.layers(x)
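This is the same architecture as MLPBinaryClassifier, just written declaratively. One quick way to confirm the layer sizes is to count parameters (a sketch):

model = MLP(2, 10)
sum(p.numel() for p in model.parameters())  # 2*10 + 10 + 10*1 + 1 = 41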
#
# train
#
x_input = torch.tensor(x_moon, dtype=torch.float32)
y_true = torch.tensor(y_moon, dtype=torch.float32).reshape(-1, 1)

model = MLP(2, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
loss = nn.functional.binary_cross_entropy

loss(model(x_input), y_true)
tensor(0.7080, grad_fn=<BinaryCrossEntropyBackward0>)
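The initial loss is close to \(-\ln 0.5 \approx 0.693\), which is what we expect from an untrained model whose outputs hover around 0.5.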
epochs = 5_000
for epoch in range(epochs):
    optimizer.zero_grad()
    l = loss(model(x_input), y_true)
    l.backward()
    optimizer.step()
    if epoch % (epochs//10) == 0:
        print(epoch, l.detach().numpy())
print(epoch, l.detach().numpy())
0 0.70800835
500 0.0012570171
1000 0.00041307812
1500 0.00020677462
2000 0.00012206836
2500 7.862232e-05
3000 5.33808e-05
3500 3.751982e-05
4000 2.6998929e-05
4500 1.9755698e-05
4999 1.4646525e-05
6 Visualize separation boundary for MLP
x_lim = np.linspace(-1.5, 2.5, 100)
y_lim = np.linspace(-1, 1.5, 100)
xx, yy = np.meshgrid(x_lim, y_lim)
x_input = np.array([xx.ravel(), yy.ravel()]).T
x_input = torch.tensor(x_input, dtype=torch.float32)

x_input.shape

model.eval()
z = model(x_input)
z = z.reshape(100, 100).detach().numpy()

pl.set_cmap('tab20c')
pl.scatter(x_moon[:, 0], x_moon[:, 1], c=y_moon, s=1)
pl.contour(xx, yy, z, levels=1);