MLP 与 BP 算法的数学原理

Sun, 08 Dec 2024 06:10:23 +0000

关于自动微分机制的数学证明放在文末，首先给出自动微分的程序实现。因为是我笔算进行推导的，可能存在谬误。

Python version: 3.12.4
numpy version: 1.26.4
sklearn version: 1.3.0

计算图定义

计算图（computational graph）是 PyTorch 与 TensorFlow 等框架用来实现自动微分的技术：沿计算图反向传播误差，即可计算出各参数的梯度，从而方便地用梯度更新神经网络的参数。其中，PyTorch 使用动态计算图设计，TensorFlow（1.x）使用静态计算图设计（TensorFlow 2.x 默认采用即时执行模式）。

在我们的实现中，计算图与自动微分系统被“嵌入”在了层的定义中。相比之下，PyTorch 在源码中定义了计算图基类，通过重载运算符等方法实现计算图的生成。

import numpy as np
from sklearn.datasets import make_moons


class Linear:
    """Fully connected layer computing ``x @ weights (+ bias)``.

    The layer caches its last input and output during the forward pass so
    that ``backward`` can compute gradients and apply a gradient-descent
    update in the same call.
    """

    def __init__(self, inputFeatures, outputFeatures, bias=True):
        """Create the layer parameters.

        Args:
            inputFeatures: Number of columns expected in the input.
            outputFeatures: Number of columns produced by the layer.
            bias: When falsy, the layer has no bias term (``self.bias`` is
                ``None``).
        """
        # np.random.rand draws from the uniform distribution on [0, 1).
        self.weights = np.random.rand(inputFeatures, outputFeatures)
        # Honour the ``bias`` flag; ``None`` is the single "no bias" marker
        # used by every method of this class.
        self.bias = np.random.rand(outputFeatures) if bias else None

    def __call__(self, x):
        """Run the forward pass and cache ``x`` and the result."""
        self.input = x
        out = x @ self.weights
        if self.bias is not None:
            out = out + self.bias
        self.output = out
        return self.output

    def paramenters(self):
        """Return the trainable arrays: ``[weights]`` or ``[weights, bias]``.

        The (misspelled) method name is kept because ``Sequential`` calls it
        under this name.
        """
        if self.bias is not None:
            return [self.weights, self.bias]
        return [self.weights]

    def backward(self, grad_output, learning_rate):
        """Back-propagate ``grad_output`` and update the parameters in place.

        Args:
            grad_output: Gradient of the loss with respect to this layer's
                output, with one row per sample of the cached input.
            learning_rate: Step size of the gradient-descent update.

        Returns:
            Gradient of the loss with respect to this layer's input.
        """
        # All gradients are computed from the pre-update weights; only then
        # are the parameters modified.
        grad_input = grad_output @ self.weights.T
        grad_weights = self.input.T @ grad_output

        self.weights -= learning_rate * grad_weights
        if self.bias is not None:
            # The bias is shared by every sample, so its gradient is the sum
            # over the batch axis.
            self.bias -= learning_rate * np.sum(grad_output, axis=0)

        return grad_input


class Sigmoid:
    """Element-wise logistic activation ``1 / (1 + exp(-x))``."""

    def __call__(self, x):
        """Apply the activation and cache the result for ``backward``."""
        denominator = 1 + np.exp(-x)
        self.output = 1 / denominator
        return self.output

    def backward(self, grad_output, learning_rate):
        """Return the gradient with respect to the input.

        Uses the cached forward output ``s`` and the identity
        ``d sigmoid / dx = s * (1 - s)``. ``learning_rate`` is accepted only
        so the signature matches the other layers; it is not used.
        """
        activated = self.output
        return grad_output * activated * (1 - activated)


class Softmax:
    """Row-wise softmax over a 2-D array (normalises along axis 1)."""

    def __call__(self, x):
        """Return the softmax of each row of ``x`` and cache it in ``output``."""
        # Subtracting the per-row maximum leaves the result unchanged but
        # keeps the exponentials from overflowing.
        row_max = np.max(x, axis=1, keepdims=True)
        exps = np.exp(x - row_max)
        exps /= np.sum(exps, axis=1, keepdims=True)
        self.output = exps
        return self.output

    # NOTE: a standalone backward pass was sketched here but is disabled;
    # the sketch is kept below for reference only.
    # NOTE(review): ``y`` is built as a column vector, and np.diag of a 2-D
    # array extracts a diagonal instead of building a matrix -- verify the
    # Jacobian before enabling this.
    #
    # def backward(self, grad_output, learning_rate):
    #     grad_input = grad_output.copy()
    #     batch_size = grad_output.shape[0]
    #
    #     for i in range(batch_size):
    #         y = self.output[i][:, None]
    #         jacobian = np.diag(y) - np.outer(y, y)
    #         grad_input[i] = jacobian @ grad_output[i]
    #
    #     return grad_input


class Sequential:
    """Container that applies a list of layers one after another."""

    def __init__(self, layers):
        """Store ``layers``; they are applied in list order."""
        self.layers = layers

    def __call__(self, x):
        """Feed ``x`` through every layer and cache the final result."""
        for layer in self.layers:
            x = layer(x)
        self.output = x
        return self.output

    def predict_proba(self, x):
        """Return softmax probabilities computed from the model output.

        The softmax is taken over the last axis, so each sample (row) of a
        2-D batch sums to 1; a 1-D output is normalised as a single sample.
        """
        logits = self(x)
        # Shift by the per-sample maximum for numerical stability; this does
        # not change the softmax value.
        e_x = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        return e_x / e_x.sum(axis=-1, keepdims=True)

    def paramenters(self):
        """Collect the parameters of every layer that exposes ``paramenters``.

        Layers without that method (for example activations) contribute
        nothing instead of raising ``AttributeError``.
        """
        collected = []
        for layer in self.layers:
            getter = getattr(layer, "paramenters", None)
            if getter is not None:
                collected.extend(getter())
        return collected

基于计算图的MLP定义

class MLP:
    """Two-hidden-layer perceptron (2 -> 4 -> 4 -> 2) with sigmoid activations.

    The network itself outputs raw logits; ``probability`` and the training
    step apply a softmax on top of them.
    """

    def __init__(self):
        self.model = Sequential([
            Linear(2, 4), Sigmoid(),
            Linear(4, 4), Sigmoid(),
            Linear(4, 2),
        ])
        self.softmax = Softmax()

    def __call__(self, x):
        """Return the logits for ``x`` (same as ``forward``)."""
        return self.model(x)

    def forward(self, x):
        """Return the logits for ``x``."""
        return self.model(x)

    def backwardAndGradientDescent(self, x, y, learning_rate):
        """Run one training step: forward, backward and parameter update.

        Args:
            x: Input data, like [[0.5, -1.2], [0.7, 0.3], [-0.2, 0.8]].
            y: Integer class labels, like [0, 1, 0].
            learning_rate: Step size passed to every layer's ``backward``.
        """
        batch_size = x.shape[0]
        logits = self.forward(x)

        # Gradient of the mean softmax cross-entropy loss with respect to the
        # logits is (softmax(logits) - one_hot(y)) / batch_size. It must be
        # built from the probabilities, not from the raw logits.
        grad_output = self.softmax(logits).copy()
        grad_output[np.arange(batch_size), y] -= 1
        grad_output /= batch_size

        # Each layer returns the gradient for the layer before it and updates
        # its own parameters along the way.
        for layer in reversed(self.model.layers):
            grad_output = layer.backward(grad_output, learning_rate)

    def probability(self, x):
        """Return class probabilities for ``x``.

        Args:
            x: Input data, like [[0.5, -1.2], [0.7, 0.3], [-0.2, 0.8]].

        Returns:
            Probabilities with one row per sample, like
            [[0.88, 0.12], [0.45, 0.55], [0.31, 0.69]].
        """
        return self.softmax(self.forward(x))

    def classify(self, x):
        """Return the index of the most probable class for each sample."""
        return np.argmax(self.probability(x), axis=1)

训练过程

# Train the MLP on the two-moons toy dataset with mini-batch gradient descent.
X, y = make_moons(n_samples=1000, noise=0.1)
batch_size = 50
max_steps = 50000
learning_rate = 0.05
mlp = MLP()
lossRecord = []

for step in range(max_steps):
    # Reshuffle the whole dataset before every pass so the mini-batches differ
    # from one pass to the next.
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    for i in range(0, X.shape[0], batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y[i:i+batch_size]
        mlp.backwardAndGradientDescent(X_batch, y_batch, learning_rate)

    if step % 10000 == 0:
        # Cross-entropy needs probabilities: taking the log of raw logits
        # (which may be negative) would produce NaN. The loss is reported on
        # the last mini-batch of this pass only.
        probs_batch = mlp.probability(X_batch)
        epsilon = 1e-12  # guards against log(0)
        # Index with the actual batch length so a short final batch works.
        rows = np.arange(len(y_batch))
        loss = -np.mean(np.log(probs_batch[rows, y_batch] + epsilon))
        lossRecord.append(loss)
        print(f"Step {step}, Loss: {loss}")

打印：

神经网络 on 二三事

MLP 与 BP 算法的数学原理

计算图定义

基于计算图的MLP定义

训练过程