10.4 深度学习框架:PyTorch 与 TensorFlow
从机器学习到深度学习
在上一节,我们学习了 Scikit-Learn 中的经典机器学习算法。现在,让我们迈向更强大的深度学习世界。
💡 核心问题:为什么需要深度学习?
传统机器学习:需要人工设计特征(feature engineering)
图像 → 手工提取边缘、纹理等特征 → 分类器 → 预测
深度学习:自动学习特征表示
图像 → 神经网络(自动学习层次化特征)→ 预测
深度学习在以下任务中表现卓越:
- 计算机视觉:图像分类、目标检测、图像生成
- 自然语言处理:机器翻译、文本生成、情感分析
- 语音识别:语音转文字、语音合成
- 强化学习:游戏 AI、机器人控制
PyTorch vs TensorFlow:两大框架对比
| 特性 | PyTorch | TensorFlow |
|---|---|---|
| 开发者 | Meta (Facebook) | Google |
| 计算图 | 动态图(Define-by-Run) | 2.x 默认动态图(Eager Execution) |
| 易用性 | 更 Pythonic,易于调试 | Keras API 简化了使用 |
| 生态系统 | 学术界主流,Hugging Face | 工业界广泛应用,TensorFlow Serving |
| 移动部署 | TorchScript, PyTorch Mobile | TensorFlow Lite |
| 性能 | 优秀 | 优秀 |
📌 本节重点:我们将深入学习 PyTorch,因为它在研究界和 AI Agent 开发中更受欢迎。最后会简要介绍 TensorFlow/Keras。
PyTorch 基础:张量操作
1. 张量(Tensor):深度学习的基本单元
PyTorch 的 torch.Tensor 和 NumPy 的 ndarray 非常相似,但有两个关键优势:
- GPU 加速:可以在 GPU 上进行计算
- 自动微分:可以自动计算梯度
python
import torch
import numpy as np
# 创建张量的多种方式
# 1. 从 Python 列表创建
x = torch.tensor([1, 2, 3, 4, 5])
print(f"x: {x}, dtype: {x.dtype}")
# 2. 从 NumPy 数组创建
np_array = np.array([1.0, 2.0, 3.0])
tensor_from_numpy = torch.from_numpy(np_array)
print(f"From NumPy: {tensor_from_numpy}")
# 3. 创建特殊张量
zeros = torch.zeros(3, 4) # 3x4 零矩阵
ones = torch.ones(2, 3) # 2x3 全 1 矩阵
rand = torch.rand(2, 3) # 2x3 随机矩阵 [0, 1)
randn = torch.randn(2, 3) # 2x3 标准正态分布
eye = torch.eye(5) # 5x5 单位矩阵
print(f"zeros:\n{zeros}")
print(f"randn:\n{randn}")
# 4. 指定数据类型
x_float = torch.tensor([1, 2, 3], dtype=torch.float32)
x_double = torch.tensor([1, 2, 3], dtype=torch.float64)
x_int = torch.tensor([1, 2, 3], dtype=torch.int32)
print(f"float32: {x_float.dtype}, float64: {x_double.dtype}, int32: {x_int.dtype}")
# 5. 张量的形状操作
x = torch.randn(2, 3, 4) # 2x3x4 张量
print(f"原始形状: {x.shape}")
# 重塑
x_reshaped = x.view(2, 12) # 变为 2x12
print(f"重塑后: {x_reshaped.shape}")
# 自动推断维度
x_reshaped2 = x.view(2, -1) # -1 表示自动计算
print(f"自动推断: {x_reshaped2.shape}")
# 转置
x_2d = torch.randn(3, 4)
x_transposed = x_2d.t() # 转置
print(f"转置前: {x_2d.shape}, 转置后: {x_transposed.shape}")
# 多维转置
x_3d = torch.randn(2, 3, 4)
x_permuted = x_3d.permute(2, 0, 1) # (2,3,4) → (4,2,3)
print(f"permute 后: {x_permuted.shape}")2. 张量运算
python
# 基本运算
a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([4.0, 5.0, 6.0])
# 逐元素运算
print(f"加法: {a + b}")
print(f"减法: {a - b}")
print(f"乘法: {a * b}")
print(f"除法: {a / b}")
print(f"幂运算: {a ** 2}")
# 矩阵运算
A = torch.randn(3, 4)
B = torch.randn(4, 5)
# 矩阵乘法
C = torch.mm(A, B) # 或者 A @ B
print(f"矩阵乘法: {A.shape} @ {B.shape} = {C.shape}")
# 批量矩阵乘法
batch_A = torch.randn(10, 3, 4) # 10 个 3x4 矩阵
batch_B = torch.randn(10, 4, 5) # 10 个 4x5 矩阵
batch_C = torch.bmm(batch_A, batch_B) # 10 个 3x5 矩阵
print(f"批量矩阵乘法: {batch_C.shape}")
# 统计运算
x = torch.randn(3, 4)
print(f"求和: {x.sum()}")
print(f"均值: {x.mean()}")
print(f"最大值: {x.max()}")
print(f"沿轴求和: {x.sum(dim=0).shape}") # 沿第 0 维
print(f"沿轴求和: {x.sum(dim=1).shape}") # 沿第 1 维3. GPU 加速
python
# 检查 GPU 是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
# 将张量移动到 GPU
x_cpu = torch.randn(1000, 1000)
x_gpu = x_cpu.to(device) # 或者 x_cpu.cuda()
# 在 GPU 上计算
y_gpu = x_gpu @ x_gpu.t()
# 移回 CPU
y_cpu = y_gpu.to('cpu') # 或者 y_gpu.cpu()
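# 注:GPU 上的张量需要先移回 CPU(.cpu() 或 .to('cpu')),才能转换为 NumPy 数组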
# 性能对比
import time
# CPU 计算
x_cpu = torch.randn(5000, 5000)
start = time.time()
y_cpu = x_cpu @ x_cpu.t()
cpu_time = time.time() - start
# GPU 计算(如果可用)
if torch.cuda.is_available():
    x_gpu = x_cpu.to('cuda')
    torch.cuda.synchronize()  # 等待 GPU 完成
    start = time.time()
    y_gpu = x_gpu @ x_gpu.t()
    torch.cuda.synchronize()
    gpu_time = time.time() - start
    print(f"CPU 时间: {cpu_time:.4f}s")
    print(f"GPU 时间: {gpu_time:.4f}s")
    print(f"加速比: {cpu_time / gpu_time:.2f}x")
else:
    print("GPU 不可用")
自动微分(Autograd):深度学习的核心
深度学习的本质是通过反向传播算法计算梯度并更新参数。PyTorch 的 autograd 模块可以自动计算梯度。
1. 基础概念
python
# 创建需要梯度的张量
x = torch.tensor([2.0, 3.0], requires_grad=True)
# 定义计算
y = x ** 2 + 3 * x + 1
# 计算梯度
# dy/dx = 2x + 3
# 在 x=[2,3] 处,dy/dx = [7, 9]
loss = y.sum() # 标量损失(反向传播需要标量)
loss.backward() # 计算梯度
print(f"x: {x}")
print(f"y: {y}")
print(f"x.grad: {x.grad}") # dy/dx2. 梯度累积
python
x = torch.tensor([1.0, 2.0], requires_grad=True)
# 第一次计算
y = x ** 2
y.sum().backward()
print(f"第一次梯度: {x.grad}")
# 第二次计算(梯度会累积!)
y = x ** 3
y.sum().backward()
print(f"累积后梯度: {x.grad}")
# 清零梯度
x.grad.zero_()
y = x ** 3
y.sum().backward()
print(f"清零后梯度: {x.grad}")3. 停止梯度追踪
python
x = torch.tensor([1.0, 2.0], requires_grad=True)
# 方法 1:torch.no_grad()
with torch.no_grad():
    y = x * 2
    print(f"y.requires_grad: {y.requires_grad}")  # False
# 方法 2:detach()
y = x * 2
y_detached = y.detach()
print(f"y_detached.requires_grad: {y_detached.requires_grad}") # False4. 手动实现梯度下降
python
# 目标:找到 y = (x - 3)^2 的最小值
x = torch.tensor([0.0], requires_grad=True)
learning_rate = 0.1
for epoch in range(100):
    # 前向传播
    y = (x - 3) ** 2
    # 反向传播
    y.backward()
    # 更新参数(注意:要在 no_grad 环境下更新,避免被计算图追踪)
    with torch.no_grad():
        x -= learning_rate * x.grad
    # 清零梯度
    x.grad.zero_()
    if (epoch + 1) % 20 == 0:
        print(f"Epoch {epoch+1}: x = {x.item():.4f}, y = {y.item():.4f}")
print(f"\n最优解: x = {x.item():.4f} (理论值: 3.0)")
构建神经网络:nn.Module
1. 第一个神经网络
python
import torch.nn as nn
import torch.nn.functional as F
class SimpleNet(nn.Module):
    """
    简单的全连接神经网络

    架构:
    输入层 (784) → 隐藏层 (128) → ReLU → 输出层 (10)
    (输出为 logits,Softmax 由 CrossEntropyLoss 在内部处理)
    """
    def __init__(self, input_dim: int = 784, hidden_dim: int = 128, output_dim: int = 10):
        super(SimpleNet, self).__init__()
        # 定义层
        self.fc1 = nn.Linear(input_dim, hidden_dim)  # 全连接层 1
        self.fc2 = nn.Linear(hidden_dim, output_dim)  # 全连接层 2

    def forward(self, x):
        """前向传播"""
        # x: (batch_size, input_dim)
        # 隐藏层
        x = self.fc1(x)  # (batch_size, hidden_dim)
        x = F.relu(x)    # 激活函数
        # 输出层
        x = self.fc2(x)  # (batch_size, output_dim)
        return x
# 创建模型
model = SimpleNet(input_dim=784, hidden_dim=128, output_dim=10)
print(model)
# 查看参数
print("\n模型参数:")
for name, param in model.named_parameters():
    print(f"{name:15s}: {param.shape}")
# 统计参数数量
total_params = sum(p.numel() for p in model.parameters())
print(f"\n总参数数: {total_params:,}")
# 前向传播
x = torch.randn(32, 784) # 批量大小 32
output = model(x)
print(f"\n输入形状: {x.shape}")
print(f"输出形状: {output.shape}")2. 常用层
python
# 1. 全连接层(Linear)
fc = nn.Linear(in_features=100, out_features=50)
# 2. 卷积层(Conv2d)
conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
# 3. 池化层(MaxPool2d)
pool = nn.MaxPool2d(kernel_size=2, stride=2)
# 4. Dropout(防止过拟合)
dropout = nn.Dropout(p=0.5)
# 5. BatchNorm(批量归一化)
bn = nn.BatchNorm1d(num_features=100)
# 6. 激活函数
relu = nn.ReLU()
sigmoid = nn.Sigmoid()
tanh = nn.Tanh()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
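# 补充示例:用一批随机数据快速验证卷积/池化层的输出形状
img_batch = torch.randn(8, 3, 32, 32)   # 8 张 3 通道 32x32 图像
out = conv(img_batch)                    # padding=1 保持空间尺寸
print(f"Conv2d 输出: {out.shape}")       # (8, 64, 32, 32)
out = pool(out)                          # 2x2 最大池化,尺寸减半
print(f"MaxPool2d 输出: {out.shape}")    # (8, 64, 16, 16)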
# 7. 循环层
rnn = nn.RNN(input_size=100, hidden_size=50, num_layers=2)
lstm = nn.LSTM(input_size=100, hidden_size=50, num_layers=2)
gru = nn.GRU(input_size=100, hidden_size=50, num_layers=2)
3. 更复杂的网络
python
class DeepNet(nn.Module):
    """
    深层神经网络,带有 Dropout 和 BatchNorm
    """
    def __init__(self):
        super(DeepNet, self).__init__()
        self.fc1 = nn.Linear(784, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # 层 1
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)
        # 层 2
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout(x)
        # 层 3
        x = self.fc3(x)
        x = self.bn3(x)
        x = F.relu(x)
        x = self.dropout(x)
        # 输出层
        x = self.fc4(x)
        return x
model = DeepNet()
print(model)
4. 使用 nn.Sequential
python
# 方法 1:顺序定义
model = nn.Sequential(
nn.Linear(784, 512),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(512, 256),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(256, 10)
)
# 方法 2:有序字典
from collections import OrderedDict
model = nn.Sequential(OrderedDict([
('fc1', nn.Linear(784, 512)),
('relu1', nn.ReLU()),
('dropout1', nn.Dropout(0.5)),
('fc2', nn.Linear(512, 256)),
('relu2', nn.ReLU()),
('dropout2', nn.Dropout(0.5)),
('fc3', nn.Linear(256, 10))
]))
print(model)
训练循环:完整的训练流程
1. 数据加载
python
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
# 定义数据转换
transform = transforms.Compose([
transforms.ToTensor(), # 转换为张量
transforms.Normalize((0.5,), (0.5,)) # 归一化
])
# 加载 MNIST 数据集
train_dataset = datasets.MNIST(
root='./data',
train=True,
transform=transform,
download=True
)
test_dataset = datasets.MNIST(
root='./data',
train=False,
transform=transform,
download=True
)
# 创建 DataLoader
train_loader = DataLoader(
train_dataset,
batch_size=64,
shuffle=True,
num_workers=2
)
test_loader = DataLoader(
test_dataset,
batch_size=64,
shuffle=False,
num_workers=2
)
print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")
# 查看一个批次
images, labels = next(iter(train_loader))
print(f"批次图像形状: {images.shape}") # (64, 1, 28, 28)
print(f"批次标签形状: {labels.shape}") # (64,)2. 完整的训练脚本
python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from typing import Tuple
class MNISTNet(nn.Module):
    """MNIST 分类网络"""
    def __init__(self):
        super(MNISTNet, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = x.view(-1, 28 * 28)  # 展平
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    device: torch.device
) -> Tuple[float, float]:
    """训练一个 epoch"""
    model.train()  # 设置为训练模式
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in dataloader:
        # 移动到设备
        images, labels = images.to(device), labels.to(device)
        # 清零梯度
        optimizer.zero_grad()
        # 前向传播
        outputs = model(images)
        loss = criterion(outputs, labels)
        # 反向传播
        loss.backward()
        # 更新参数
        optimizer.step()
        # 统计
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100.0 * correct / total
    return epoch_loss, epoch_acc
def evaluate(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.Module,
    device: torch.device
) -> Tuple[float, float]:
    """评估模型"""
    model.eval()  # 设置为评估模式
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # 不计算梯度
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100.0 * correct / total
    return epoch_loss, epoch_acc
# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
# 创建模型
model = MNISTNet().to(device)
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 训练
num_epochs = 10
best_acc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
    print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")
    # 保存最佳模型
    if test_acc > best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"  → 保存最佳模型(准确率: {best_acc:.2f}%)")
print(f"\n训练完成!最佳测试准确率: {best_acc:.2f}%")
3. 学习率调度
python
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
# 方法 1:每 N 个 epoch 降低学习率
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
# 方法 2:当指标不再改善时降低学习率
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)
# 方法 3:余弦退火
scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)
# 在训练循环中使用
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    # StepLR 和 CosineAnnealingLR
    scheduler.step()
    # ReduceLROnPlateau(需要传入监控指标)
    # scheduler.step(train_loss)
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch+1}, LR: {current_lr:.6f}")
实战:图像分类与迁移学习
1. 使用预训练模型
python
from torchvision import models
# 加载预训练的 ResNet-18
model = models.resnet18(pretrained=True)
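# 说明:这里沿用 pretrained=True 的旧写法;较新的 torchvision(>=0.13)会提示弃用,
# 推荐写法为 models.resnet18(weights=models.ResNet18_Weights.DEFAULT)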
# 冻结所有层
for param in model.parameters():
    param.requires_grad = False
# 替换最后的全连接层(1000 类 → 10 类)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)
# 只训练最后一层
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
print(model)
2. 自定义数据集
python
from torch.utils.data import Dataset
from PIL import Image
import os
class CustomImageDataset(Dataset):
    """
    自定义图像数据集

    目录结构:
    data/
        class1/
            img1.jpg
            img2.jpg
        class2/
            img3.jpg
            img4.jpg
    """
    def __init__(self, root_dir: str, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        # 收集所有图像路径和标签
        self.images = []
        self.labels = []
        for class_name in self.classes:
            class_dir = os.path.join(root_dir, class_name)
            for img_name in os.listdir(class_dir):
                if img_name.endswith(('.jpg', '.png', '.jpeg')):
                    self.images.append(os.path.join(class_dir, img_name))
                    self.labels.append(self.class_to_idx[class_name])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label = self.labels[idx]
        # 加载图像
        image = Image.open(img_path).convert('RGB')
        # 应用转换
        if self.transform:
            image = self.transform(image)
        return image, label
# 使用示例
transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# dataset = CustomImageDataset('path/to/data', transform=transform)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
3. 数据增强
python
from torchvision import transforms
# 训练时的数据增强
train_transform = transforms.Compose([
transforms.Resize((256, 256)),
transforms.RandomCrop(224), # 随机裁剪
transforms.RandomHorizontalFlip(), # 随机水平翻转
transforms.RandomRotation(15), # 随机旋转
transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2), # 颜色抖动
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# 测试时不增强
test_transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
TensorFlow/Keras 快速入门
虽然我们主要使用 PyTorch,但了解 TensorFlow 也很有价值。
python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# 1. 构建模型(Sequential API)
model = keras.Sequential([
layers.Flatten(input_shape=(28, 28)),
layers.Dense(512, activation='relu'),
layers.Dropout(0.2),
layers.Dense(256, activation='relu'),
layers.Dropout(0.2),
layers.Dense(10, activation='softmax')
])
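# 查看网络结构与参数数量(作用类似于 PyTorch 中遍历 named_parameters)
model.summary()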
# 2. 编译模型
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
# 3. 训练模型
# history = model.fit(
# x_train, y_train,
# batch_size=64,
# epochs=10,
# validation_split=0.2
# )
# 4. 评估模型
# test_loss, test_acc = model.evaluate(x_test, y_test)
# 5. 预测
# predictions = model.predict(x_test)
Functional API(更灵活):
python
# 输入层
inputs = keras.Input(shape=(28, 28))
# 隐藏层
x = layers.Flatten()(inputs)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.2)(x)
# 输出层
outputs = layers.Dense(10, activation='softmax')(x)
# 创建模型
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
PyTorch vs TensorFlow 代码对比
模型定义
python
# PyTorch
class PyTorchModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# TensorFlow/Keras
def create_keras_model():
    return keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(784,)),
        layers.Dense(10)
    ])
训练循环
python
# PyTorch(手动循环)
for epoch in range(num_epochs):
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
# TensorFlow/Keras(自动)
model.fit(x_train, y_train, epochs=num_epochs, batch_size=64)
高级技巧
1. 混合精度训练
python
from torch.cuda.amp import autocast, GradScaler
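# 说明:较新版本的 PyTorch 推荐使用 torch.amp.autocast('cuda') 与 torch.amp.GradScaler('cuda'),
# 此处沿用 torch.cuda.amp 接口,老版本同样可用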
# 创建梯度缩放器
scaler = GradScaler()
for images, labels in train_loader:
    optimizer.zero_grad()
    # 自动混合精度
    with autocast():
        outputs = model(images)
        loss = criterion(outputs, labels)
    # 缩放梯度
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
2. 梯度裁剪
python
# 防止梯度爆炸:在 loss.backward() 之后、optimizer.step() 之前调用
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
3. 模型保存和加载
python
# 保存整个模型
torch.save(model, 'model.pth')
loaded_model = torch.load('model.pth')
# 只保存参数(推荐)
torch.save(model.state_dict(), 'model_weights.pth')
# 加载参数
model = MNISTNet()
model.load_state_dict(torch.load('model_weights.pth'))
model.eval()
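# 补充:加载权重后即可用于推理(示意,假设 images 是前面 DataLoader 取出的一批 CPU 上的 MNIST 图像)
with torch.no_grad():
    logits = model(images[:1])  # MNISTNet 的 forward 内部会自行展平
    print(f"预测类别: {logits.argmax(dim=1).item()}")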
# 保存检查点(包含优化器状态)
checkpoint = {
'epoch': epoch,
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')
# 加载检查点
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
小结
在本节中,我们学习了:
✅ PyTorch 基础
- 张量操作和 GPU 加速
- 自动微分(Autograd)
- 梯度计算和优化
✅ 构建神经网络
- nn.Module 设计模式
- 常用层和激活函数
- 自定义网络架构
✅ 训练流程
- 数据加载(Dataset 和 DataLoader)
- 完整的训练循环
- 模型评估和保存
✅ 实战应用
- MNIST 手写数字识别
- 迁移学习和预训练模型
- 自定义数据集
✅ TensorFlow/Keras
- Sequential 和 Functional API
- 与 PyTorch 的对比
✅ 高级技巧
- 混合精度训练
- 梯度裁剪
- 学习率调度
练习题
基础题
- 创建一个 3x4 的随机张量,计算其转置并验证形状
- 实现一个简单的线性回归模型,拟合 y = 2x + 1
- 使用 nn.Sequential 构建一个三层全连接网络
进阶题
- 在 MNIST 数据集上训练一个卷积神经网络(CNN)
- 实现一个自定义的 Dataset 类,加载本地图像文件
- 使用预训练的 ResNet 进行迁移学习,在 CIFAR-10 上微调
挑战题
- 实现学习率预热(Warmup)+ 余弦退火调度器
- 对比不同优化器(SGD、Adam、AdamW)在同一任务上的性能
- 实现梯度累积,模拟更大的批量大小
下一节:10.5 神经网络原理:从感知机到 Transformer
在下一节,我们将深入神经网络的数学原理,理解反向传播算法,并最终实现 Transformer 架构——现代大语言模型的基石!