# Implementing Deep Learning

## Implementation in PyTorch

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data loading and preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean and std
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

class MNISTCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                 # 28x28 -> 14x14
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                 # 14x14 -> 7x7
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, 10)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)            # flatten to (batch, 64*7*7)
        return self.fc_layers(x)

model = MNISTCNN().to(device)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)
loss_fn = nn.CrossEntropyLoss(reduction='mean')

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)

    avg_train_loss = train_loss / len(train_dataset)

    # Evaluate on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    test_acc = 100 * correct / total

    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Test Acc: {test_acc:.2f}%')

torch.save(model.state_dict(), 'mnist_cnn_model.pth')
print("Saved CNN model state!")
```

### Implementing deep learning in PyTorch involves the following steps:

`Data preparation` -> `Model definition` -> `Optimizer & loss function` -> `Model training (standard/custom)` -> `Evaluation` -> `Saving & deployment`

### Model Definition

#### Basic definition template

```python
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()  # the parent constructor must be called
        # Define the network layers here
        self.layer1 = nn.Linear(in_features=784, out_features=256)
        self.layer2 = nn.Linear(256, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Define the forward-pass logic
        x = self.relu(self.layer1(x))
        x = self.layer2(x)
        return x
```

- The forward-pass logic must be defined manually in the `forward` method, and the parent constructor must be called via `super(MyModel, self).__init__()`.
- `nn.Module` is the base class for all neural network modules; your own model must inherit from it and implement `forward`.
- `nn.Linear` is a fully connected layer; `in_features` is the number of input features, `out_features` the number of output features.
- `nn.ReLU` is an activation function.
- `self.layer1 = nn.Linear(in_features=784, out_features=256)` defines a fully connected layer with 784 inputs and 256 outputs.
- `self.layer2 = nn.Linear(256, 10)` defines a fully connected layer with 256 inputs and 10 outputs.
- `self.relu = nn.ReLU()` defines an activation function.
- `x = self.relu(self.layer1(x))` passes the input through `layer1`, then through the `relu` activation.
- `x = self.layer2(x)` passes the result through `layer2`.
- `return x` returns the output. A quick sanity check of the template follows this list.

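A minimal sketch of instantiating the template and running a forward pass; the batch size of 32 and the random input are arbitrary choices of ours:

```python
import torch

model = MyModel()
dummy = torch.randn(32, 784)  # a batch of 32 flattened 28x28 inputs
out = model(dummy)
print(out.shape)              # torch.Size([32, 10])
```
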
#### Simplifying model definition with nn.Sequential

```python
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=784, out_features=256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        return self.model(x)
```

- `nn.Sequential` is a container that chains multiple layers in order, similar to TensorFlow's `tf.keras.Sequential`.
- `self.model = nn.Sequential(nn.Linear(in_features=784, out_features=256), nn.ReLU(), nn.Linear(256, 10))` defines a model with two fully connected layers and one activation function.
- `return self.model(x)` runs the forward pass through `self.model`.

#### PyTorch ships more than 50 layer types (see the official docs); they are all used the same way: keep adding layers to your `nn.Module`.

| Layer type | Constructor parameters | Purpose | Example | Typical use |
|------------|------------------------|---------|---------|-------------|
| nn.Linear | in_features (input dim), out_features (output dim) | Fully connected layer: applies a linear transformation to the input. | self.fc = nn.Linear(784, 256) | MLPs, fully connected networks |
| nn.Conv2d | in_channels, out_channels, kernel_size, stride=1, padding=0 | 2-D convolution: extracts local features (image processing). | self.conv1 = nn.Conv2d(3, 64, 3, padding=1) | CNNs, image classification |
| nn.ReLU | inplace=False (default) | Non-linear activation: introduces non-linearity, mitigates vanishing gradients. | self.relu = nn.ReLU(inplace=True) | After most layers |
| nn.BatchNorm2d | num_features (number of channels) | Batch normalization: speeds up training, stabilizes gradients. | self.bn1 = nn.BatchNorm2d(64) | CNNs, ResNet |
| nn.MaxPool2d | kernel_size, stride (defaults to kernel_size), padding=0 | Max pooling: downsamples, cuts computation, adds translation invariance. | self.pool = nn.MaxPool2d(2, 2) | Image feature extraction |
| nn.Dropout | p=0.5 (drop probability) | Random deactivation of units: fights overfitting. | self.dropout = nn.Dropout(0.5) | MLPs, RNNs, CNNs |
| nn.Embedding | num_embeddings (vocabulary size), embedding_dim (embedding dimension) | Embedding layer: maps discrete categories to dense vectors. | self.embedding = nn.Embedding(10000, 128) | NLP, word embeddings |
| nn.LSTM | input_size, hidden_size, num_layers, batch_first=True | Long short-term memory: processes sequences, captures long-range dependencies. | self.lstm = nn.LSTM(100, 64, 2, batch_first=True) | Text classification, time-series forecasting |
| nn.GRU | input_size, hidden_size, num_layers, batch_first=True | Gated recurrent unit: a simplified LSTM, cheaper to compute. | self.gru = nn.GRU(100, 64, 2, batch_first=True) | Similar use cases to LSTM |
| nn.TransformerEncoderLayer | d_model, nhead, dim_feedforward | Transformer encoder layer: self-attention over sequences. | self.encoder_layer = nn.TransformerEncoderLayer(512, 8) | NLP, machine translation |
| nn.Upsample | scale_factor (upsampling factor), mode='nearest' | Upsampling: enlarges feature maps (e.g. super-resolution). | self.up = nn.Upsample(scale_factor=2, mode='nearest') | Image segmentation, generative models |
| nn.ConvTranspose2d | in_channels, out_channels, kernel_size, stride=1, padding=0 | Transposed convolution: learned upsampling, e.g. for GAN outputs. | self.deconv = nn.ConvTranspose2d(64, 3, 3, padding=1) | GANs, image inpainting |
| nn.Softmax | dim (dimension to normalize over) | Turns raw outputs into a probability distribution over classes. | self.softmax = nn.Softmax(dim=1) | Final layer of classifiers |
| nn.CrossEntropyLoss | weight, ignore_index (both optional) | Combines LogSoftmax and negative log-likelihood, for classification. | criterion = nn.CrossEntropyLoss() | Classification (no manual Softmax needed) |
| nn.MSELoss | reduction='mean' (default) | Mean squared error loss for regression. | criterion = nn.MSELoss() | Regression, coordinate prediction |

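As an illustration of composing layers from the table, here is a hypothetical reusable block; the `ConvBlock` name and its hyperparameters are our own, not a PyTorch class:

```python
import torch.nn as nn

class ConvBlock(nn.Module):
    """Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d -> Dropout, all from the table above."""
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),   # halves the spatial resolution
            nn.Dropout(0.25),
        )

    def forward(self, x):
        return self.block(x)
```
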
### Defining the Optimizer and Loss Function

I. Defining the optimizer

The optimizer updates the model's parameters; common algorithms include SGD, Adam, and RMSProp. PyTorch's torch.optim module provides many of them.

1. Common optimizer examples

```python
import torch.optim as optim

# Define an optimizer (pick one)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99)
```

2. Common parameters

| Parameter | Meaning | Default |
|-----------|---------|---------|
| params | The parameters to optimize (usually model.parameters()) | required |
| lr | Learning rate | 1e-3 |
| momentum | Momentum term (SGD family only) | 0 |
| weight_decay | Weight decay (L2 regularization) | 0 |
| betas | Adam's β1 and β2 (exponential decay rates of the gradient moment estimates) | (0.9, 0.999) |
| eps | Small constant added in Adam to avoid division by zero | 1e-8 |

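`params` also accepts a list of parameter groups, so different parts of the model can use different hyperparameters. A minimal sketch, reusing the `MNISTCNN` defined earlier; the specific values are arbitrary:

```python
import torch.optim as optim

# Hypothetical per-group hyperparameters for the MNISTCNN defined above
optimizer = optim.AdamW([
    {'params': model.conv_layers.parameters(), 'lr': 1e-3},
    {'params': model.fc_layers.parameters(), 'lr': 1e-4, 'weight_decay': 1e-2},
])
```
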
II. Defining the loss function

The loss function measures the gap between the model's predictions and the true labels; common choices include mean squared error and cross-entropy. PyTorch's torch.nn module provides many loss functions.

1. Common loss function examples

```python
import torch.nn as nn

# Classification tasks (e.g. image classification)
criterion = nn.CrossEntropyLoss(
    reduction='mean'  # one of 'none', 'mean', 'sum'
)

# Regression tasks (e.g. house price prediction)
criterion = nn.MSELoss(
    reduction='mean'  # one of 'none', 'mean', 'sum'
)
```

2. Common parameters

- reduction: how per-element losses are aggregated; one of 'none' (no aggregation), 'sum', or 'mean'. Defaults to 'mean'.
- weight: per-class weights, for a weighted cross-entropy loss.
- ignore_index: a class index to ignore, useful in multi-class tasks. Both are sketched below.

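A minimal sketch of `weight` and `ignore_index` together; the three-class setup and the values are illustrative only:

```python
import torch
import torch.nn as nn

class_weights = torch.tensor([1.0, 1.0, 2.0])  # up-weight class 2
criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

logits = torch.randn(4, 3)              # batch of 4 samples, 3 classes
labels = torch.tensor([0, 2, -100, 1])  # the third sample is ignored
loss = criterion(logits, labels)
```
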
### Model Training

#### I. Standard training (the common flow)

This is the most common pattern, built on the high-level abstractions of DataLoader and torch.nn.

1. Core APIs

API | Purpose | Parameters |
--- | --- | --- |
torch.utils.data.DataLoader | Data loader: wraps a Dataset and serves it in batches | dataset: the dataset object; batch_size: samples per iteration |
model.train() | Switches the model to training mode (enables Dropout/BatchNorm training behavior) | none |
optimizer.step() | Applies the parameter update (using the computed gradients) | none |
optimizer.zero_grad() | Clears the gradient buffers before the next backward pass | none |
loss.backward() | Backpropagation: computes gradients | retain_graph=True keeps the graph for additional backward passes |

2. Example

```python
# Define the model, optimizer and loss function
model = MyModel().to(device)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.001,            # learning rate (the key hyperparameter)
    betas=(0.9, 0.999),  # momentum coefficients
    eps=1e-8,            # numerical-stability constant
    weight_decay=0.01    # weight decay (regularization)
)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()  # set training mode
    for inputs, labels in dataloader:  # dataloader is an iterator; each step yields one batch
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass + parameter update
        optimizer.zero_grad()  # clear gradients
        loss.backward()        # compute gradients
        optimizer.step()       # update parameters

    # Log progress
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
```

#### II. Custom training (flexible control)

- For scenarios that need special logic, such as dynamic learning rates, gradient clipping, or multi-task learning.

1. Extension APIs

API | Purpose | Parameters |
--- | --- | --- |
torch.optim.lr_scheduler | Learning-rate schedulers (e.g. StepLR, ReduceLROnPlateau) | optimizer: the optimizer object; step_size: scheduling interval |
torch.nn.utils.clip_grad_norm_ | Gradient clipping, prevents exploding gradients | parameters: model parameters; max_norm: maximum gradient norm |
with torch.no_grad(): | Disables gradient tracking to save memory (used for inference/evaluation) | none |

2. Example

```python
# Add a learning-rate scheduler and gradient clipping
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
gradient_clip = 5.0

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (inputs, labels) in enumerate(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass + gradient clipping + parameter update
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()

        # Validation-set evaluation (custom logic)
        if batch_idx % 100 == 0:
            validate(model, val_dataloader, criterion)

    # Adjust the learning rate once per epoch
    scheduler.step()
```

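`StepLR` decays on a fixed schedule; `ReduceLROnPlateau` (mentioned in the table above) instead reacts to a monitored metric. A sketch, assuming the `validate` helper returns the validation loss as a float:

```python
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=5)

for epoch in range(num_epochs):
    # ... run one epoch of training as above ...
    val_loss = validate(model, val_dataloader, criterion)
    scheduler.step(val_loss)  # unlike StepLR, step() takes the monitored metric
```
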
#### Key parameters

1. DataLoader parameters

```python
DataLoader(
    dataset=MyDataset(),
    batch_size=64,    # batch size
    shuffle=True,     # shuffle the data during training
    num_workers=4,    # worker processes for data loading
    pin_memory=True   # speeds up host-to-GPU transfers
)
```

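The `dataset` argument must implement `__len__` and `__getitem__`. A minimal sketch of what the custom `MyDataset` above might look like; the in-memory tensor layout is our own assumption:

```python
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features  # e.g. a float tensor of shape (N, 784)
        self.labels = labels      # e.g. a long tensor of shape (N,)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
```
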
#### Advanced techniques

1. Mixed precision training

```python
# Mixed precision training with torch.cuda.amp
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass in reduced precision
        with torch.cuda.amp.autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Backward pass and optimization with loss scaling
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
```

2. Distributed training

```python
import os
import torch

# Distributed training (launch with torchrun, one process per GPU)
torch.distributed.init_process_group(backend='nccl')
local_rank = int(os.environ['LOCAL_RANK'])  # set by torchrun
device = torch.device(f'cuda:{local_rank}')
model = model.to(device)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization (DDP syncs gradients automatically)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```

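One detail the loop above glosses over: each process should see a distinct shard of the data, which `DistributedSampler` provides. A minimal sketch, reusing the `train_dataset` from the MNIST example:

```python
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

sampler = DistributedSampler(train_dataset)  # shards the data across processes
dataloader = DataLoader(train_dataset, batch_size=64, sampler=sampler)

for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # reshuffle differently each epoch
    # ... training loop as above ...
```
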
3. Gradient checkpointing

```python
# Gradient checkpointing: trades compute for memory by re-running
# the forward pass during backprop instead of storing activations
from torch.utils.checkpoint import checkpoint

def forward(inputs):
    return model(inputs)

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass through the checkpointed function
        outputs = checkpoint(forward, inputs, use_reentrant=False)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
```

### Model Evaluation

```python
import torch
from sklearn.metrics import classification_report

def run_evaluation(model, test_loader, class_names):
    model.eval()
    device = next(model.parameters()).device  # evaluate on the model's device
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs.to(device))
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())

    # Print the report
    print(classification_report(all_labels, all_preds, target_names=class_names))

    # Return the results as a dict
    return {
        'accuracy': sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels),
        'classification_report': classification_report(all_labels, all_preds, target_names=class_names)
    }
```

### Saving and Loading Models

```python
# Save the model
torch.save(model.state_dict(), 'model.pth')

# Load the model
model = YourModelClass()
model.load_state_dict(torch.load('model.pth'))
model.eval()
```

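To resume training rather than just run inference, it is common to also save the optimizer state and epoch counter. A sketch, with file name and dictionary keys of our own choosing:

```python
# Save a full training checkpoint
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint, 'checkpoint.pth')

# Resume later
checkpoint = torch.load('checkpoint.pth', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1
```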