# Implementing Deep Learning

## Implementation in PyTorch

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader

# Use the GPU when available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data loading and preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

class MNISTCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, 10)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        return self.fc_layers(x)

model = MNISTCNN().to(device)

optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)
loss_fn = nn.CrossEntropyLoss(reduction='mean')

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * images.size(0)

    avg_train_loss = train_loss / len(train_dataset)

    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    test_acc = 100 * correct / total

    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Test Acc: {test_acc:.2f}%')

torch.save(model.state_dict(), 'mnist_cnn_model.pth')
print("Saved CNN model state!")
```

### Implementing deep learning in PyTorch involves the following steps:

`Data preparation` -> `Model definition` -> `Define optimizer & loss function` -> `Model training (automatic/custom)` -> `Model evaluation` -> `Save & deploy`

### Model Definition

#### Basic definition template

```python
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()  # the parent constructor must be called
        # Define the network layers here
        self.layer1 = nn.Linear(in_features=784, out_features=256)
        self.layer2 = nn.Linear(256, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Define the forward-pass logic
        x = self.relu(self.layer1(x))
        x = self.layer2(x)
        return x
```

- You must define the forward-pass logic yourself in the `forward` method, and you must call the parent constructor with `super(MyModel, self).__init__()`.
- `nn.Module` is the base class for all neural network modules; your own model must inherit from it and implement `forward`.
- `nn.Linear` is a fully connected layer; `in_features` is the number of input features and `out_features` the number of output features.
- `nn.ReLU` is an activation function.
- `self.layer1 = nn.Linear(in_features=784, out_features=256)` defines a fully connected layer with 784 input features and 256 output features.
- `self.layer2 = nn.Linear(256, 10)` defines a fully connected layer with 256 input features and 10 output features.
- `self.relu = nn.ReLU()` defines an activation function.
- `x = self.relu(self.layer1(x))` passes the input through `layer1` and then through the `relu` activation.
- `x = self.layer2(x)` passes the result through `layer2`.
- `return x` returns the output.
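
As a quick sanity check, the template above can be instantiated and run on a dummy batch. This is a minimal sketch; the 784-dimensional input assumes flattened 28x28 images, as in the MNIST example.

```python
model = MyModel()
dummy = torch.randn(4, 784)   # a batch of 4 flattened 28x28 inputs
logits = model(dummy)
print(logits.shape)           # torch.Size([4, 10])
```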

#### Simplifying model definition with nn.Sequential

```python
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=784, out_features=256),
            nn.ReLU(),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        return self.model(x)
```

- `nn.Sequential` is a container that chains multiple layers in order, similar to TensorFlow's `tf.keras.Sequential`.
- `self.model = nn.Sequential(nn.Linear(in_features=784, out_features=256), nn.ReLU(), nn.Linear(256, 10))` defines a model consisting of two fully connected layers and one activation function.
- `return self.model(x)` runs the forward pass through `self.model`.

#### PyTorch provides more than 50 layer types (see the official documentation); they are all used the same way: keep adding layers to your `nn.Module`

| Layer type | Key parameters | Purpose | Example | Typical use |
|------------|----------------|---------|---------|-------------|
| nn.Linear | in_features (input dim), out_features (output dim) | Fully connected layer: applies a linear transformation to the input. | self.fc = nn.Linear(784, 256) | MLPs, fully connected networks |
| nn.Conv2d | in_channels, out_channels, kernel_size, stride=1, padding=0 | 2D convolution: extracts local features (image processing). | self.conv1 = nn.Conv2d(3, 64, 3, padding=1) | CNNs, image classification |
| nn.ReLU | inplace=False (default) | Non-linear activation: mitigates vanishing gradients, introduces non-linearity. | self.relu = nn.ReLU(inplace=True) | After most layers |
| nn.BatchNorm2d | num_features (number of channels) | Batch normalization: speeds up training, stabilizes gradients. | self.bn1 = nn.BatchNorm2d(64) | CNNs, ResNet |
| nn.MaxPool2d | kernel_size, stride=kernel_size, padding=0 | Max pooling: downsamples, reduces computation, adds translation invariance. | self.pool = nn.MaxPool2d(2, 2) | Image feature extraction |
| nn.Dropout | p=0.5 (drop probability, default 0.5) | Random dropout: prevents overfitting. | self.dropout = nn.Dropout(0.5) | MLP, RNN, CNN |
| nn.Embedding | num_embeddings (vocabulary size), embedding_dim (embedding dimension) | Embedding layer: maps discrete categories to dense vectors. | self.embedding = nn.Embedding(10000, 128) | NLP, word embeddings |
| nn.LSTM | input_size, hidden_size, num_layers, batch_first=True | Long short-term memory: processes sequences, captures long-range dependencies. | self.lstm = nn.LSTM(100, 64, 2, batch_first=True) | Text classification, time-series forecasting |
| nn.GRU | input_size, hidden_size, num_layers, batch_first=True | Gated recurrent unit: a simplified LSTM, computationally cheaper. | self.gru = nn.GRU(100, 64, 2, batch_first=True) | Similar applications to LSTM |
| nn.TransformerEncoderLayer | d_model, nhead, dim_feedforward, dropout | Transformer encoder layer: self-attention over sequences. | self.encoder_layer = nn.TransformerEncoderLayer(512, 8) | NLP, machine translation |
| nn.Upsample | scale_factor (upsampling factor), mode='nearest' | Upsampling: enlarges feature maps (e.g. super-resolution). | self.up = nn.Upsample(scale_factor=2, mode='nearest') | Image segmentation, generative models |
| nn.ConvTranspose2d | in_channels, out_channels, kernel_size, stride=1, padding=0 | Transposed convolution: learned upsampling, e.g. for GAN generators. | self.deconv = nn.ConvTranspose2d(64, 3, 3, padding=1) | GANs, image inpainting |
| nn.Softmax | dim (e.g. dim=1) | Converts outputs into a probability distribution over classes. | self.softmax = nn.Softmax(dim=1) | Final layer for classification |
| nn.CrossEntropyLoss | weight, ignore_index=-100 (optional) | Combines LogSoftmax and negative log-likelihood for classification. | criterion = nn.CrossEntropyLoss() | Classification (no manual Softmax needed) |
| nn.MSELoss | reduction='mean' | Mean squared error loss: regression tasks. | criterion = nn.MSELoss() | Regression, coordinate prediction |
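
A minimal sketch of how several of these layers combine into one `nn.Module`, here an Embedding + LSTM + Linear text classifier. The vocabulary size, dimensions, and class count are illustrative, not taken from the examples above.

```python
import torch
import torch.nn as nn

class TextClassifier(nn.Module):
    """Illustrative only: 10000-word vocabulary, 128-dim embeddings, 64-dim LSTM, 5 classes."""
    def __init__(self, vocab_size=10000, embed_dim=128, hidden_size=64, num_classes=5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, token_ids):                  # token_ids: (batch, seq_len) int64
        embedded = self.embedding(token_ids)       # (batch, seq_len, embed_dim)
        output, (h_n, c_n) = self.lstm(embedded)   # h_n: (num_layers, batch, hidden_size)
        return self.fc(h_n[-1])                    # classify from the last layer's final hidden state

model = TextClassifier()
dummy_tokens = torch.randint(0, 10000, (8, 20))    # batch of 8 sequences of length 20
print(model(dummy_tokens).shape)                   # torch.Size([8, 5])
```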

### Defining the Optimizer and Loss Function

I. Defining the optimizer

The optimizer updates the model's parameters; common algorithms include SGD, Adam, and RMSProp. PyTorch's torch.optim module provides many optimizers.

1. Common optimizer examples

```python
import torch.optim as optim

# Define an optimizer (pick one)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99)
```

2. Common parameter reference

| Parameter | Meaning | Default |
|-----------|---------|---------|
| params | Parameters to optimize (usually model.parameters()) | required |
| lr | Learning rate | 1e-3 (Adam; varies by optimizer) |
| momentum | Momentum term (SGD family only) | 0 |
| weight_decay | Weight decay (L2 regularization) | 0 |
| betas | Adam's β1 and β2 (exponential decay rates for the moment estimates) | (0.9, 0.999) |
| eps | Small constant that protects Adam against division by zero | 1e-8 |
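
These parameters can also be set per parameter group. A minimal sketch, assuming `model` is the `MNISTCNN` defined earlier (so it has `conv_layers` and `fc_layers` attributes); the specific learning rates are illustrative.

```python
optimizer = optim.SGD([
    {'params': model.conv_layers.parameters()},                          # uses the global lr below
    {'params': model.fc_layers.parameters(), 'lr': 1e-3, 'weight_decay': 1e-4},
], lr=1e-2, momentum=0.9)
```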

II. Defining the loss function

The loss function measures the gap between the model's predictions and the true labels; common choices include mean squared error and cross-entropy. PyTorch's torch.nn module provides many loss functions.

1. Common loss function examples

```python
import torch.nn as nn

# Classification tasks (e.g. image classification)
criterion = nn.CrossEntropyLoss(
    reduction='mean'  # 'none', 'mean', or 'sum'
)

# Regression tasks (e.g. house-price prediction)
criterion = nn.MSELoss(
    reduction='mean'  # 'none', 'mean', or 'sum'
)
```

2. Common parameter reference

- reduction: how the per-element losses are aggregated; valid values are 'none' (no aggregation), 'sum', and 'mean'. Defaults to 'mean'.
- weight: per-class weights, used for weighted cross-entropy.
- ignore_index: a class index to skip, useful in multi-class tasks (e.g. padding labels).
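
A minimal sketch of `weight` and `ignore_index` together; the class weights and label values here are made up for illustration.

```python
import torch
import torch.nn as nn

# Three classes; class 2 is assumed rare, so it gets a larger weight.
# Targets equal to -100 (the default ignore_index) are skipped.
class_weights = torch.tensor([1.0, 1.0, 5.0])
criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)

logits = torch.randn(4, 3)                 # batch of 4, 3 classes
targets = torch.tensor([0, 2, -100, 1])    # third sample is ignored
loss = criterion(logits, targets)
```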

### Model Training

#### I. Automatic training (standard workflow)

This is the most common pattern, built on the high-level abstractions of DataLoader and the torch.nn module.

1. Core APIs

| API | Purpose | Parameters |
|-----|---------|------------|
| torch.utils.data.DataLoader | Data loader: wraps a Dataset and serves it in batches | dataset: the dataset object; batch_size: samples per iteration |
| model.train() | Switches the model to training mode (enables Dropout/BatchNorm training behavior) | none |
| optimizer.step() | Applies the parameter update (using the computed gradients) | none |
| optimizer.zero_grad() | Clears the gradient buffers before the next backward pass | none |
| loss.backward() | Backpropagation: computes gradients | retain_graph=True keeps the graph for another backward pass |

2. Code example

```python
# Define the model, optimizer, and loss function
model = MyModel().to(device)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.001,             # learning rate (the key hyperparameter)
    betas=(0.9, 0.999),   # momentum coefficients
    eps=1e-8,             # numerical-stability constant
    weight_decay=0.01     # weight decay (regularization)
)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()  # switch to training mode
    for inputs, labels in dataloader:  # the DataLoader is an iterator that yields one batch per step
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass + parameter update
        optimizer.zero_grad()  # clear gradients
        loss.backward()        # compute gradients
        optimizer.step()       # update parameters

    # Log progress
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
```

#### II. Custom training (flexible control)

- Suitable for scenarios that need special logic (dynamic learning-rate changes, gradient clipping, multi-task learning, and so on).

1. Additional APIs

| API | Purpose | Parameters |
|-----|---------|------------|
| torch.optim.lr_scheduler | Learning-rate schedulers (e.g. StepLR, ReduceLROnPlateau) | optimizer: the optimizer; step_size: scheduling interval |
| torch.nn.utils.clip_grad_norm_ | Gradient clipping to prevent exploding gradients | parameters: model parameters; max_norm: maximum gradient norm |
| with torch.no_grad(): | Disables gradient tracking to save memory (used for inference/evaluation) | none |

2. Code example

```python
# Add a learning-rate scheduler and gradient clipping
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
gradient_clip = 5.0

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (inputs, labels) in enumerate(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass + gradient clipping + parameter update
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()

        # Validation-set evaluation (custom logic; validate() is user-defined)
        if batch_idx % 100 == 0:
            validate(model, val_dataloader, criterion)

    # Adjust the learning rate once per epoch
    scheduler.step()
```

#### Key parameters

1. DataLoader parameters

```python
DataLoader(
    dataset=MyDataset(),   # a custom Dataset (see the sketch below)
    batch_size=64,         # batch size
    shuffle=True,          # shuffle the data during training
    num_workers=4,         # load data with multiple worker processes
    pin_memory=True        # speeds up host-to-GPU transfers
)
```
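
`MyDataset` above is only a placeholder. A minimal custom Dataset just implements `__len__` and `__getitem__`; the sketch below uses made-up random tensors as data.

```python
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Toy dataset holding random features and labels (for illustration only)."""
    def __init__(self, n_samples=1000):
        self.x = torch.randn(n_samples, 784)
        self.y = torch.randint(0, 10, (n_samples,))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

loader = DataLoader(MyDataset(), batch_size=64, shuffle=True)
```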

#### Advanced techniques

1. Mixed precision training

```python
# Mixed precision training with torch.cuda.amp
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass under autocast (uses float16 where it is safe to do so)
        with torch.cuda.amp.autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        # Backward pass and optimization with gradient scaling
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
```

2. Distributed training

```python
# Distributed data-parallel training (typically one process per GPU)
torch.distributed.init_process_group(backend='nccl')
model = model.to(device)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device])

# Note: the DataLoader would normally use a DistributedSampler so each process sees its own shard of the data.
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```

3. Gradient checkpointing

```python
# Gradient checkpointing: trade compute for memory by re-running the forward pass during backward
from torch.utils.checkpoint import checkpoint

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Wrap the forward pass in a function so it can be checkpointed
        def forward(inputs):
            return model(inputs)

        # use_reentrant=False is the recommended variant and works even when the inputs do not require grad
        outputs = checkpoint(forward, inputs, use_reentrant=False)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
```

### Model Evaluation

```python
import torch
from sklearn.metrics import classification_report

def run_evaluation(model, test_loader, class_names):
    model.eval()
    device = next(model.parameters()).device  # run on whatever device the model lives on
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Print the report
    print(classification_report(all_labels, all_preds, target_names=class_names))

    # Return the results as a dict
    return {
        'accuracy': sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels),
        'classification_report': classification_report(all_labels, all_preds, target_names=class_names)
    }
```
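
For the MNIST example at the top of this page, this could be called roughly as follows (a sketch; the class names are just the digit strings).

```python
metrics = run_evaluation(model, test_loader, class_names=[str(i) for i in range(10)])
print(f"Accuracy: {metrics['accuracy']:.4f}")
```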

### Saving and Loading Models

```python
# Save the model (state_dict only: the weights, not the class definition)
torch.save(model.state_dict(), 'model.pth')

# Load the model
model = YourModelClass()
model.load_state_dict(torch.load('model.pth'))  # add map_location='cpu' to load GPU weights on a CPU-only machine
model.eval()
```
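
To resume training rather than just run inference, a common pattern is to save the optimizer state and epoch counter alongside the weights. A minimal sketch, assuming `model`, `optimizer`, and `epoch` from the training loops above; the file name is illustrative.

```python
# Save a full training checkpoint
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint, 'checkpoint.pth')

# Resume later
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1
```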