返回

MLP 与 BP 算法的数学原理

关于自动微分机制的数学证明放在文末,首先给出自动微分的程序实现。因为是我笔算进行推导的,可能存在谬误。

Python version: 3.12.4 numpy version: 1.26.4 sklearn version: 1.3.0

计算图定义

计算图(computational graph)是一种被用于pytorchtensorflow中进行自动微分以实现误差的反向传播、进而计算各参数梯度的技术,这使得我们可以方便地使用梯度更新神经网络的参数。其中,pytorch使用动态计算图设计,tensorflow使用静态计算图设计。

我们的实现中,计算图与自动微分系统被“嵌入”在了层的定义。pytorch在源码中定义了计算图基类,通过重载运算符等方法实现计算图的生成。

import numpy as np
from sklearn.datasets import make_moons


class Linear:
    def __init__(self, inputFeatures, outputFeatures, bias=True):
        self.weights = np.random.rand(inputFeatures, outputFeatures)
        self.bias = np.random.rand(outputFeatures)
    
    def __call__(self, x):
        self.input = x
        self.output = x @ self.weights
        if self.bias is not False:
            self.output += self.bias
        return self.output
    
    def paramenters(self):
        if self.bias is not False:
            return [self.output, self.bias]
        return [self.output]
    
    def backward(self, grad_output, learning_rate):
        grad_input = grad_output @ self.weights.T
        grad_weights = self.input.T @ grad_output
        grad_bias = np.sum(grad_output, axis=0) if self.bias is not None else None
        
        self.weights -= learning_rate * grad_weights
        if self.bias is not None:
            self.bias -= learning_rate * grad_bias

        return grad_input


class Sigmoid:
    def __call__(self, x):
        self.output = 1 / (1 + np.exp(-x))
        return self.output

    def backward(self, grad_output, learning_rate):
        grad_input = grad_output * self.output * (1 - self.output)
        return grad_input


class Softmax:
    def __call__(self, x):
        self.output = np.exp(x - np.max(x, axis=1, keepdims=True))
        self.output /= np.sum(self.output, axis=1, keepdims=True)
        return self.output

    # def backward(self, grad_output, learning_rate):
    #     grad_input = grad_output.copy()
    #     batch_size = grad_output.shape[0]

    #     for i in range(batch_size):
    #         y = self.output[i][:, None]
    #         jacobian = np.diag(y) - np.outer(y, y)
    #         grad_input[i] = jacobian @ grad_output[i]

    #     return grad_input


class Sequential:
    def __init__(self, layers):
        self.layers = layers
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.output = x
        return self.output
    
    def predict_proba(self, x):
        logits = self(x)
        e_x = np.exp(logits - np.max(logits))
        return e_x / e_x.sum(axis=0, keepdims=True)
    
    def paramenters(self):
        return [p for layer in self.layers for p in layer.paramenters()]

基于计算图的MLP定义

class MLP:
    def __init__(self):
        self.model = Sequential([
            Linear(2, 4), Sigmoid(),
            Linear(4, 4), Sigmoid(),
            Linear(4, 2)
        ])
        self.softmax = Softmax()
    
    def __call__(self, x):
        return self.model(x)

    def forward(self, x):
        return self.model(x)

    def backwardAndGradientDescent(self, x, y, learning_rate):
        '''
        x: input darta vector, like [[0.5, -1.2], [0.7, 0.3], [-0.2, 0.8]]
        y: labels vector, like [0, 1, 0]
        '''
        batch_size = x.shape[0]
        logits = self.forward(x)

        grad_output = logits.copy()
        grad_output[range(batch_size), y] -= 1  # cross entropy gradient
        grad_output /= batch_size

        for layer in reversed(self.model.layers):
            grad_output = layer.backward(grad_output, learning_rate)
    
    def probability(self, x):
        '''
        x: input darta vector, like [[0.5, -1.2], [0.7, 0.3], [-0.2, 0.8]]
        return: probability vector, like [[0.88, 0.12], [0.45, 0.55],  [0.31, 0.69]]
        '''
        batch_size = x.shape[0]
        logits = self.forward(x)
        return self.softmax(logits)

    def classify(self, x):
        return np.argmax(self.probability(x), axis=1)

训练过程

X, y = make_moons(n_samples=1000, noise=0.1)
batch_size= 50
max_steps = 50000
learning_rate = 0.05
mlp = MLP()
lossRecord = []

for step in range(max_steps):
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    for i in range(0, X.shape[0], batch_size):
        X_batch = X[i:i+batch_size]
        y_batch = y[i:i+batch_size]
        mlp.backwardAndGradientDescent(X_batch, y_batch, learning_rate)

    if step % 10000 == 0:
        output_batch = mlp.forward(X_batch)
        epsilon = 1e-12
        loss = -np.mean(np.log(output_batch[range(batch_size), y_batch] + epsilon))
        lossRecord.append(loss)
        print(f"Step {step}, Loss: {loss}")

打印:

Step 0, Loss: 0.6577937985199145 Step 10000, Loss: 0.00873273601484354 Step 20000, Loss: 0.00953517332872011 Step 30000, Loss: 0.0031810759662180698 Step 40000, Loss: 2.1824599140328784e-05 Step 50000, Loss: 0.00025276002897095404

import matplotlib.pyplot as plt

steps = np.arange(0, max_steps, 1000)
plt.rcParams.update({'font.family': 'serif', 'font.serif': ['Times New Roman'], 'axes.titlesize': 14, 'axes.labelsize': 12, 'xtick.labelsize': 10, 'ytick.labelsize': 10, 'lines.linewidth': 2, 'lines.markersize': 5, 'figure.figsize': (8, 6), 'axes.grid': True, 'grid.alpha': 0.5, 'grid.linestyle': '--', 'grid.color': 'gray'})
plt.plot(steps, lossRecord, color='#9b95c9', label='Training Loss')
plt.title('Training Loss Curve', fontsize=16)
plt.xlabel('Steps', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.legend(loc='upper right', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
# plt.savefig("training_loss_curve.svg", format="svg")
plt.show()

training loss curve

超平面可视化

可视化的原始代码来自唐亘先生的GitHub项目,在他的视频徒手实现多层感知器–人工智能的创世纪中对此有作演示。在他的可视化代码基础上,我做了若干更改,使数据图漂亮不少——至少,从我审美看的话。

def draw_data(data):
    '''
    将数据可视化
    '''
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    x, y = data
    label1 = x[y > 0]
    ax.scatter(label1[:, 0], label1[:, 1], marker='o', color='royalblue', label='Class 1', alpha=0.7)
    label0 = x[y == 0]
    ax.scatter(label0[:, 0], label0[:, 1], marker='^', color='darkorange', label='Class 0', alpha=0.7)
    ax.set_title('Data Points Visualization', fontsize=16)
    ax.set_xlabel('Feature 1', fontsize=14)
    ax.set_ylabel('Feature 2', fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.legend(loc='upper right', fontsize=12)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
    return ax

def draw_model(ax, model):
    '''
    将模型的分离超平面可视化
    '''
    x1 = np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 100)
    x2 = np.linspace(ax.get_ylim()[0], ax.get_ylim()[1], 100)
    x1, x2 = np.meshgrid(x1, x2)
    y = model.probability(np.c_[x1.ravel(), x2.ravel()])[:, 1]
    y = y.reshape(x1.shape)
    contour = ax.contourf(x1, x2, y, levels=[0, 0.5], colors=['#9B95C9'], alpha=0.15)
    ax.contour(x1, x2, y, levels=[0.5], colors='#7B68EE', linewidths=2, linestyles='solid')
    ax.set_title('Decision Boundary', fontsize=16)
    ax.set_xlabel('Feature 1', fontsize=14)
    ax.set_ylabel('Feature 2', fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.5)
    return ax
draw_data([X, y])
plt.tight_layout()
# plt.savefig("data_distribution.svg", format="svg")
plt.show()

data distribution

draw_model(draw_data([X, y]), mlp)
plt.tight_layout()
# plt.savefig("hyperplane.svg", format="svg")
plt.show()

hyperplane

自动微分机制的数学证明

在一个MLP中,假设batch_size = 1,以分子布局表示向量微分,记 $\boldsymbol{z}\in\mathbb{R}^{1\times n}$、$\boldsymbol{z}',\boldsymbol{b}\in\mathbb{R}^{1\times m}$、$\boldsymbol{W}\in\mathbb{R}^{n\times m}$,

$$ \boldsymbol{z}'=\sigma\big(\boldsymbol{zW}+\boldsymbol{b}\big)\tag{1} $$

也可以写成标量形式,即

$$ z'_i=\sigma\Big(\sum^n_{j=1}w_{ji}z_j+b_i\Big),\ \ \ \ i=1,2\cdots,m\tag{2} $$

设中间变量 $\boldsymbol{a}=\boldsymbol{Wz}+\boldsymbol{b}$,$\boldsymbol{a}\in\mathbb{R}^{1\times m}$,$\boldsymbol{a}$ 表示数据经线性变换后、经非线性映射(激活函数)前的内容,有公式 $\boldsymbol{z}'=\sigma(\boldsymbol{a})$,与

$$ a_i=\sum^n_{j=1}w_{ji}z_j+b_i,\ \ \ \ i=1,2\cdots,m\tag{3} $$

根据链式法则,当梯度传播至 $\boldsymbol{z}'$,即已知 $\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}'}\big|_{\boldsymbol{\theta}}$,其中 $\mathcal{L}$ 为损失函数,则

$$ \frac{\partial\mathcal{L}}{\partial a_i}=\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}'_i}\frac{\partial{z'_i}}{\partial{a_i}},\ \ \ \ i=1,2\cdots,m\tag{4} $$

写作向量形式,即

$$ \boxed{\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}=\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}'}\frac{\partial\boldsymbol{z}'}{\partial\boldsymbol{a}}=\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}'}\odot\,\textbf{grad}\big(\sigma(\boldsymbol{a})\big)=\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}'}\odot\sigma'(\boldsymbol{a})}\tag{5} $$

所以激活函数$\sigma$在反向传播时,应将 $\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}$ 传播至低一层。对于不含参数的激活函数,并不需要进行参数的更新,其任务只是传递梯度 $\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}$。其中$\odot$表示Hadamard积,在numpy中对应操作符 *

现在逐个讨论 $\frac{\partial\mathcal{L}}{\partial\boldsymbol{W}}$、$\frac{\partial\mathcal{L}}{\partial\boldsymbol{b}}$ 与 $\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}}$。对于$\frac{\partial\mathcal{L}}{\partial\boldsymbol{W}}$,$i,j\in[1,m]\times[1,n]$,有

$$ \frac{\partial\mathcal{L}}{\partial w_{ji}}=\sum^m_{k=1}\frac{\partial\mathcal{L}}{\partial a_k}\frac{\partial a_k}{\partial w_{ji}}=\sum^m_{k=1}\frac{\partial\mathcal{L}}{\partial a_k}\frac{\partial}{\partial w_{ji}}\left(\sum^n_{l=1}w_{lk}\,z_l+b_k\right)=\frac{\partial\mathcal{L}}{\partial a_i}\cdot z_j\tag{6} $$

所以

$$ \boxed{\begin{aligned}\\&\ \ \ \ \frac{\partial\mathcal{L}}{\partial\boldsymbol{W}}=\left(\begin{matrix}z_1\frac{\partial\mathcal{L}}{\partial a_1}&z_2\frac{\partial\mathcal{L}}{\partial a_1}&\cdots&z_n\frac{\partial\mathcal{L}}{\partial a_1}\\z_1\frac{\partial\mathcal{L}}{\partial a_2}&z_2\frac{\partial\mathcal{L}}{\partial a_2}&\cdots&z_n\frac{\partial\mathcal{L}}{\partial a_2}\\\vdots&\vdots&&\vdots\\z_1\frac{\partial\mathcal{L}}{\partial a_m}&z_2\frac{\partial\mathcal{L}}{\partial a_m}&\cdots&z_n\frac{\partial\mathcal{L}}{\partial a_m}\\\end{matrix}\right)=\boldsymbol{z}^T\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}\ \ \ \ \\&\end{aligned}}\tag{7} $$

接着讨论$\frac{\partial\mathcal{L}}{\partial\boldsymbol{b}}$,易见

$$ \frac{\partial\mathcal{L}}{\partial b_i}=\frac{\partial\mathcal{L}}{\partial a_i}\frac{\partial a_i}{\partial{b_i}}=\frac{\partial\mathcal{L}}{\partial a_i}\times1,\ \ \ \ i=1,2\cdots,m\tag{8} $$

显然

$$ \boxed{\frac{\partial\mathcal{L}}{\partial\boldsymbol{b}}=\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}}\tag{9} $$

batch_size大于$1$时,$\frac{\partial\mathcal{L}}{\partial\boldsymbol{b}}$ 应取np.sum(grad_output, axis=0)而非grad_output。这一步对其他梯度计算而言已经蕴含在矩阵 / 张量乘法中了,但对偏置需要单独实现。

最后讨论 $\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}}$,同理

$$ \begin{align*}\frac{\partial\mathcal{L}}{\partial z_i}&=\sum^m_{j=1}\frac{\partial\mathcal{L}}{\partial a_j}\frac{\partial a_j}{\partial z_i}\tag{10}\\&=\sum^m_{j=1}\frac{\partial\mathcal{L}}{\partial a_j}\frac{\partial}{\partial z_i}\left(\sum^n_{k=1}w_{kj}z_k+b_j\right)\tag{11}\\&=\sum^m_{j=1}w_{ij}\frac{\partial\mathcal{L}}{\partial a_j},\ \ \ \ i=1,2\cdots,n\tag{12}\end{align*} $$

因此

$$ \boxed{\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}}=\frac{\partial\mathcal{L}}{\partial\boldsymbol{a}}\boldsymbol{W}^T}\tag{13} $$

其中$\frac{\partial\mathcal{L}}{\partial\boldsymbol{W}}$与$\frac{\partial\mathcal{L}}{\partial\boldsymbol{b}}$通过梯度下降及其变种算法参与参数更新,$\frac{\partial\mathcal{L}}{\partial\boldsymbol{z}}$则传递给低一层的网络或激活函数。如果要更新网络参数,前向传播时各层应保存输入,以便在反向传播时计算梯度。

代码中的反向传播算法实际上正是该推理的程序实现。

gradient descent

补充:关于批归一化技术与层归一化技术,偶然发现有一篇博客总结得十分不错,和我的见解高度一致,在此引用,

“朴素贝叶斯”的博客园文章:深度学习中的逐层归一化