
10.4 Deep Learning Frameworks: PyTorch and TensorFlow

From Machine Learning to Deep Learning

In the previous section we covered the classic machine learning algorithms in Scikit-Learn. Now let's step into the more powerful world of deep learning.

💡 Key question: why do we need deep learning?

Traditional machine learning: features must be designed by hand (feature engineering)

Image → hand-crafted features (edges, textures, ...) → classifier → prediction

Deep learning: feature representations are learned automatically

Image → neural network (learns hierarchical features automatically) → prediction

Deep learning excels at tasks such as:

  • Computer vision: image classification, object detection, image generation
  • Natural language processing: machine translation, text generation, sentiment analysis
  • Speech: speech-to-text, speech synthesis
  • Reinforcement learning: game-playing AI, robot control

PyTorch vs TensorFlow: a Comparison

| Feature           | PyTorch                            | TensorFlow                                      |
|-------------------|------------------------------------|-------------------------------------------------|
| Developer         | Meta (Facebook)                    | Google                                          |
| Computation graph | Dynamic (define-by-run)            | Dynamic by default since 2.x (eager execution)  |
| Ease of use       | More Pythonic, easier to debug     | Keras API simplifies usage                      |
| Ecosystem         | Dominant in academia, Hugging Face | Widely deployed in industry, TensorFlow Serving |
| Mobile deployment | TorchScript, PyTorch Mobile        | TensorFlow Lite                                 |
| Performance       | Excellent                          | Excellent                                       |

📌 Focus of this chapter: we will study PyTorch in depth, since it is more popular in research and in AI agent development, and briefly introduce TensorFlow/Keras at the end.
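
Before diving in, it is worth verifying your environment. A minimal sanity check, assuming PyTorch is already installed (for example via pip install torch):

python
import torch

print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if a CUDA GPU is usable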


PyTorch Basics: Tensor Operations

1. Tensors: the Basic Unit of Deep Learning

PyTorch's torch.Tensor is very similar to NumPy's ndarray, but has two key advantages:

  • GPU acceleration: computations can run on a GPU
  • Automatic differentiation: gradients can be computed automatically
python
import torch
import numpy as np

# Several ways to create tensors
# 1. From a Python list
x = torch.tensor([1, 2, 3, 4, 5])
print(f"x: {x}, dtype: {x.dtype}")

# 2. From a NumPy array (the tensor shares memory with the array)
np_array = np.array([1.0, 2.0, 3.0])
tensor_from_numpy = torch.from_numpy(np_array)
print(f"From NumPy: {tensor_from_numpy}")

# 3. Special tensors
zeros = torch.zeros(3, 4)  # 3x4 matrix of zeros
ones = torch.ones(2, 3)    # 2x3 matrix of ones
rand = torch.rand(2, 3)    # 2x3 uniform random in [0, 1)
randn = torch.randn(2, 3)  # 2x3 standard normal
eye = torch.eye(5)         # 5x5 identity matrix

print(f"zeros:\n{zeros}")
print(f"randn:\n{randn}")

# 4. Specifying the dtype
x_float = torch.tensor([1, 2, 3], dtype=torch.float32)
x_double = torch.tensor([1, 2, 3], dtype=torch.float64)
x_int = torch.tensor([1, 2, 3], dtype=torch.int32)

print(f"float32: {x_float.dtype}, float64: {x_double.dtype}, int32: {x_int.dtype}")

# 5. Shape manipulation
x = torch.randn(2, 3, 4)  # 2x3x4 tensor
print(f"Original shape: {x.shape}")

# Reshape
x_reshaped = x.view(2, 12)  # reshape to 2x12
print(f"Reshaped: {x_reshaped.shape}")

# Infer one dimension automatically
x_reshaped2 = x.view(2, -1)  # -1 means "infer this dimension"
print(f"Inferred: {x_reshaped2.shape}")

# Transpose
x_2d = torch.randn(3, 4)
x_transposed = x_2d.t()
print(f"Before transpose: {x_2d.shape}, after: {x_transposed.shape}")

# Permuting multiple dimensions
x_3d = torch.randn(2, 3, 4)
x_permuted = x_3d.permute(2, 0, 1)  # (2,3,4) → (4,2,3)
print(f"After permute: {x_permuted.shape}")

2. Tensor Operations

python
# Basic arithmetic
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])

# Elementwise operations
print(f"Addition: {a + b}")
print(f"Subtraction: {a - b}")
print(f"Multiplication: {a * b}")
print(f"Division: {a / b}")
print(f"Power: {a ** 2}")

# Matrix operations
A = torch.randn(3, 4)
B = torch.randn(4, 5)

# Matrix multiplication
C = torch.mm(A, B)  # equivalently, A @ B
print(f"Matrix product: {A.shape} @ {B.shape} = {C.shape}")

# Batched matrix multiplication
batch_A = torch.randn(10, 3, 4)  # 10 matrices of shape 3x4
batch_B = torch.randn(10, 4, 5)  # 10 matrices of shape 4x5
batch_C = torch.bmm(batch_A, batch_B)  # 10 matrices of shape 3x5
print(f"Batched matmul: {batch_C.shape}")

# Reductions
x = torch.randn(3, 4)
print(f"Sum: {x.sum()}")
print(f"Mean: {x.mean()}")
print(f"Max: {x.max()}")
print(f"Sum over dim 0: {x.sum(dim=0).shape}")  # reduces the first dimension
print(f"Sum over dim 1: {x.sum(dim=1).shape}")  # reduces the second dimension

3. GPU Acceleration

python
# Check whether a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Move a tensor to the GPU
x_cpu = torch.randn(1000, 1000)
x_gpu = x_cpu.to(device)  # or x_cpu.cuda()

# Compute on the GPU
y_gpu = x_gpu @ x_gpu.t()

# Move back to the CPU
y_cpu = y_gpu.to('cpu')  # or y_gpu.cpu()

# Performance comparison
import time

# CPU timing
x_cpu = torch.randn(5000, 5000)
start = time.time()
y_cpu = x_cpu @ x_cpu.t()
cpu_time = time.time() - start

# GPU timing (if available)
if torch.cuda.is_available():
    x_gpu = x_cpu.to('cuda')
    _ = x_gpu @ x_gpu.t()     # warm-up: the first CUDA op pays a one-time startup cost
    torch.cuda.synchronize()  # wait for the GPU to finish
    start = time.time()
    y_gpu = x_gpu @ x_gpu.t()
    torch.cuda.synchronize()
    gpu_time = time.time() - start

    print(f"CPU time: {cpu_time:.4f}s")
    print(f"GPU time: {gpu_time:.4f}s")
    print(f"Speedup: {cpu_time / gpu_time:.2f}x")
else:
    print("No GPU available")

Automatic Differentiation (Autograd): the Core of Deep Learning

Deep learning boils down to computing gradients via backpropagation and using them to update parameters. PyTorch's autograd module computes these gradients automatically.

1. Basic Concepts

python
# Create a tensor that tracks gradients
x = torch.tensor([2.0, 3.0], requires_grad=True)

# Define a computation
y = x ** 2 + 3 * x + 1

# Compute gradients
# dy/dx = 2x + 3
# at x = [2, 3], dy/dx = [7, 9]
loss = y.sum()   # backward() needs a scalar
loss.backward()  # compute gradients

print(f"x: {x}")
print(f"y: {y}")
print(f"x.grad: {x.grad}")  # dy/dx

2. Gradient Accumulation

python
x = torch.tensor([1.0, 2.0], requires_grad=True)

# First backward pass
y = x ** 2
y.sum().backward()
print(f"First gradient: {x.grad}")

# Second backward pass (gradients accumulate!)
y = x ** 3
y.sum().backward()
print(f"Accumulated gradient: {x.grad}")

# Zero the gradients before the next pass
x.grad.zero_()
y = x ** 3
y.sum().backward()
print(f"Gradient after zeroing: {x.grad}")

3. Disabling Gradient Tracking

python
x = torch.tensor([1.0, 2.0], requires_grad=True)

# Option 1: torch.no_grad()
with torch.no_grad():
    y = x * 2
    print(f"y.requires_grad: {y.requires_grad}")  # False

# Option 2: detach()
y = x * 2
y_detached = y.detach()
print(f"y_detached.requires_grad: {y_detached.requires_grad}")  # False

4. Gradient Descent by Hand

python
# Goal: find the minimum of y = (x - 3)^2

x = torch.tensor([0.0], requires_grad=True)
learning_rate = 0.1

for epoch in range(100):
    # Forward pass
    y = (x - 3) ** 2

    # Backward pass
    y.backward()

    # Update the parameter (must happen outside the computation graph)
    with torch.no_grad():
        x -= learning_rate * x.grad

    # Zero the gradient
    x.grad.zero_()

    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}: x = {x.item():.4f}, y = {y.item():.4f}")

print(f"\nOptimum: x = {x.item():.4f} (true value: 3.0)")

Building Neural Networks: nn.Module

1. Your First Neural Network

python
import torch.nn as nn
import torch.nn.functional as F

class SimpleNet(nn.Module):
    """
    简单的全连接神经网络

    架构:
    输入层 (784) → 隐藏层 (128) → ReLU → 输出层 (10) → Softmax
    """

    def __init__(self, input_dim: int = 784, hidden_dim: int = 128, output_dim: int = 10):
        super(SimpleNet, self).__init__()

        # 定义层
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # 全连接层 1
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # 全连接层 2

    def forward(self, x):
        """前向传播"""
        # x: (batch_size, input_dim)

        # 隐藏层
        x = self.fc1(x)  # (batch_size, hidden_dim)
        x = F.relu(x)    # 激活函数

        # 输出层
        x = self.fc2(x)  # (batch_size, output_dim)

        return x

# 创建模型
model = SimpleNet(input_dim=784, hidden_dim=128, output_dim=10)
print(model)

# 查看参数
print("\n模型参数:")
for name, param in model.named_parameters():
    print(f"{name:15s}: {param.shape}")

# 统计参数数量
total_params = sum(p.numel() for p in model.parameters())
print(f"\n总参数数: {total_params:,}")

# 前向传播
x = torch.randn(32, 784)  # 批量大小 32
output = model(x)
print(f"\n输入形状: {x.shape}")
print(f"输出形状: {output.shape}")

2. Common Layers

python
# 1. Fully connected (Linear) layer
fc = nn.Linear(in_features=100, out_features=50)

# 2. Convolutional layer (Conv2d)
conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)

# 3. Pooling layer (MaxPool2d)
pool = nn.MaxPool2d(kernel_size=2, stride=2)

# 4. Dropout (regularization against overfitting)
dropout = nn.Dropout(p=0.5)

# 5. BatchNorm (batch normalization)
bn = nn.BatchNorm1d(num_features=100)

# 6. Activation functions
relu = nn.ReLU()
sigmoid = nn.Sigmoid()
tanh = nn.Tanh()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)

# 7. Recurrent layers
rnn = nn.RNN(input_size=100, hidden_size=50, num_layers=2)
lstm = nn.LSTM(input_size=100, hidden_size=50, num_layers=2)
gru = nn.GRU(input_size=100, hidden_size=50, num_layers=2)
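
To get a feel for how these layers change tensor shapes, here is a small sketch that pushes a fake 3-channel 32x32 image through the conv and pooling layers defined above:

python
img = torch.randn(1, 3, 32, 32)  # (batch, channels, height, width)
out = pool(relu(conv(img)))
print(out.shape)  # torch.Size([1, 64, 16, 16]): padding=1 preserves 32x32, pooling halves it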

3. A Deeper Network

python
class DeepNet(nn.Module):
    """
    深层神经网络,带有 Dropout 和 BatchNorm
    """

    def __init__(self):
        super(DeepNet, self).__init__()

        self.fc1 = nn.Linear(784, 512)
        self.bn1 = nn.BatchNorm1d(512)

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)

        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        self.fc4 = nn.Linear(128, 10)

        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # 层 1
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)

        # 层 2
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout(x)

        # 层 3
        x = self.fc3(x)
        x = self.bn3(x)
        x = F.relu(x)
        x = self.dropout(x)

        # 输出层
        x = self.fc4(x)

        return x

model = DeepNet()
print(model)

4. Using nn.Sequential

python
# Option 1: define the layers positionally
model = nn.Sequential(
    nn.Linear(784, 512),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 10)
)

# Option 2: an OrderedDict gives each layer a name
from collections import OrderedDict

model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(784, 512)),
    ('relu1', nn.ReLU()),
    ('dropout1', nn.Dropout(0.5)),
    ('fc2', nn.Linear(512, 256)),
    ('relu2', nn.ReLU()),
    ('dropout2', nn.Dropout(0.5)),
    ('fc3', nn.Linear(256, 10))
]))

print(model)
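
Either way the submodules stay accessible: any Sequential can be indexed by position, and the OrderedDict version also exposes layers by name:

python
print(model[0])   # first layer, by position
print(model.fc1)  # the same layer, by name (OrderedDict version)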

The Training Loop: a Complete Pipeline

1. Loading Data

python
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

# Define the preprocessing pipeline
transform = transforms.Compose([
    transforms.ToTensor(),                # convert to a tensor
    transforms.Normalize((0.5,), (0.5,))  # normalize
])

# Load the MNIST dataset
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True
)

test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    transform=transform,
    download=True
)

# Create the DataLoaders
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=2
)

test_loader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=2
)

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# Inspect one batch
images, labels = next(iter(train_loader))
print(f"Batch of images: {images.shape}")  # (64, 1, 28, 28)
print(f"Batch of labels: {labels.shape}")  # (64,)

2. A Complete Training Script

python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from typing import Tuple

class MNISTNet(nn.Module):
    """MNIST classifier"""

    def __init__(self):
        super(MNISTNet, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 28 * 28)  # flatten
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    device: torch.device
) -> Tuple[float, float]:
    """Train for one epoch"""
    model.train()  # training mode

    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in dataloader:
        # Move the data to the device
        images, labels = images.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Statistics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100.0 * correct / total

    return epoch_loss, epoch_acc

def evaluate(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device
) -> Tuple[float, float]:
    """Evaluate the model"""
    model.eval()  # evaluation mode

    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # no gradients needed
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100.0 * correct / total

    return epoch_loss, epoch_acc

# Select the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Create the model
model = MNISTNet().to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training
num_epochs = 10
best_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")

    # Keep the best model
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"  → Saved best model (accuracy: {best_acc:.2f}%)")

print(f"\nTraining complete! Best test accuracy: {best_acc:.2f}%")

3. Learning-Rate Scheduling

python
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR

# Option 1: decay the learning rate every N epochs
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# Option 2: decay when a monitored metric stops improving
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

# Option 3: cosine annealing
scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)

# Inside the training loop
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)

    # StepLR and CosineAnnealingLR
    scheduler.step()

    # ReduceLROnPlateau (pass it the monitored metric instead)
    # scheduler.step(train_loss)

    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch+1}, LR: {current_lr:.6f}")

Hands-On: Image Classification and Transfer Learning

1. Using a Pretrained Model

python
from torchvision import models

# Load a pretrained ResNet-18
# (pretrained=True is deprecated in recent torchvision; the weights argument replaces it)
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Replace the final fully connected layer (1000 classes → 10 classes)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)

# Train only the new final layer
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

print(model)
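
A common follow-up, sketched here as an optional extension rather than part of the recipe above: once the new head has converged, unfreeze the backbone and fine-tune everything, giving the pretrained weights a much smaller learning rate via parameter groups:

python
# Unfreeze the whole network
for param in model.parameters():
    param.requires_grad = True

# Per-group learning rates: new head fast, pretrained backbone slow
optimizer = optim.Adam([
    {'params': model.fc.parameters(), 'lr': 1e-3},
    {'params': [p for n, p in model.named_parameters() if not n.startswith('fc.')], 'lr': 1e-5},
])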

2. A Custom Dataset

python
from torch.utils.data import Dataset
from PIL import Image
import os

class CustomImageDataset(Dataset):
    """
    自定义图像数据集

    目录结构:
    data/
        class1/
            img1.jpg
            img2.jpg
        class2/
            img3.jpg
            img4.jpg
    """

    def __init__(self, root_dir: str, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        # 收集所有图像路径和标签
        self.images = []
        self.labels = []

        for class_name in self.classes:
            class_dir = os.path.join(root_dir, class_name)
            for img_name in os.listdir(class_dir):
                if img_name.endswith(('.jpg', '.png', '.jpeg')):
                    self.images.append(os.path.join(class_dir, img_name))
                    self.labels.append(self.class_to_idx[class_name])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label = self.labels[idx]

        # Load the image
        image = Image.open(img_path).convert('RGB')

        # Apply the transforms
        if self.transform:
            image = self.transform(image)

        return image, label

# Usage example
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# dataset = CustomImageDataset('path/to/data', transform=transform)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

3. Data Augmentation

python
from torchvision import transforms

# Augmentations applied at training time
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),         # random crop
    transforms.RandomHorizontalFlip(),  # random horizontal flip
    transforms.RandomRotation(15),      # random rotation up to ±15 degrees
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),  # color jitter
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# No augmentation at test time
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

A Quick Introduction to TensorFlow/Keras

Although this chapter focuses on PyTorch, a working knowledge of TensorFlow is also valuable.

python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 1. Build the model (Sequential API)
model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax')
])

# 2. Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# 3. Train the model
# history = model.fit(
#     x_train, y_train,
#     batch_size=64,
#     epochs=10,
#     validation_split=0.2
# )

# 4. Evaluate the model
# test_loss, test_acc = model.evaluate(x_test, y_test)

# 5. Predict
# predictions = model.predict(x_test)

The Functional API (more flexible):

python
# Input layer
inputs = keras.Input(shape=(28, 28))

# Hidden layers
x = layers.Flatten()(inputs)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.2)(x)

# Output layer
outputs = layers.Dense(10, activation='softmax')(x)

# Create the model
model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

PyTorch vs TensorFlow: Code Side by Side

Model Definition

python
# PyTorch
class PyTorchModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# TensorFlow/Keras
def create_keras_model():
    return keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(784,)),
        layers.Dense(10)
    ])

Training Loop

python
# PyTorch (explicit training loop)
for epoch in range(num_epochs):
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# TensorFlow/Keras (the loop is handled by fit())
model.fit(x_train, y_train, epochs=num_epochs, batch_size=64)

Advanced Techniques

1. Mixed-Precision Training

python
from torch.cuda.amp import autocast, GradScaler
# (newer PyTorch versions also expose these as torch.amp.autocast / torch.amp.GradScaler)

# Create the gradient scaler
scaler = GradScaler()

for images, labels in train_loader:
    optimizer.zero_grad()

    # Run the forward pass under automatic mixed precision
    with autocast():
        outputs = model(images)
        loss = criterion(outputs, labels)

    # Scale the loss before backward, then step through the scaler
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

2. Gradient Clipping

python
# Prevent exploding gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
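
Clipping must be applied after loss.backward() has populated the gradients and before optimizer.step() consumes them; a minimal placement sketch:

python
for images, labels in train_loader:
    optimizer.zero_grad()
    loss = criterion(model(images), labels)
    loss.backward()
    # clip after backward(), before step()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()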

3. Saving and Loading Models

python
# Save the entire model (pickles the class itself; brittle if the code changes)
torch.save(model, 'model.pth')
loaded_model = torch.load('model.pth')

# Save only the parameters (recommended)
torch.save(model.state_dict(), 'model_weights.pth')

# Load the parameters
model = MNISTNet()
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()

# Save a checkpoint (including the optimizer state)
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')

# Load the checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

Summary

In this section we covered:

PyTorch basics

  • Tensor operations and GPU acceleration
  • Automatic differentiation (autograd)
  • Gradient computation and optimization

Building neural networks

  • The nn.Module design pattern
  • Common layers and activation functions
  • Custom network architectures

The training pipeline

  • Data loading (Dataset and DataLoader)
  • A complete training loop
  • Model evaluation and saving

Hands-on applications

  • MNIST handwritten digit recognition
  • Transfer learning with pretrained models
  • Custom datasets

TensorFlow/Keras

  • The Sequential and Functional APIs
  • Comparison with PyTorch

Advanced techniques

  • Mixed-precision training
  • Gradient clipping
  • Learning-rate scheduling

Exercises

Basics

  1. Create a 3x4 random tensor, compute its transpose, and verify the shapes
  2. Implement a simple linear regression model that fits y = 2x + 1
  3. Build a three-layer fully connected network with nn.Sequential

Intermediate

  1. Train a convolutional neural network (CNN) on the MNIST dataset
  2. Implement a custom Dataset class that loads image files from disk
  3. Fine-tune a pretrained ResNet on CIFAR-10 via transfer learning

Challenges

  1. Implement a learning-rate schedule with warmup followed by cosine annealing
  2. Compare optimizers (SGD, Adam, AdamW) on the same task
  3. Implement gradient accumulation to simulate a larger batch size

Next section: 10.5 Neural Network Fundamentals: from the Perceptron to the Transformer

In the next section we will dig into the mathematics behind neural networks, understand the backpropagation algorithm, and finally implement the Transformer architecture, the foundation of modern large language models!
