PyTorch基础

学习目标

理解PyTorch的核心数据结构：Tensor（张量）
掌握PyTorch Tensor的创建、操作和基本运算
理解PyTorch的自动求导机制（autograd）
学会使用PyTorch构建简单的神经网络
掌握PyTorch的数据加载和训练流程
了解模型的保存与加载方法

3.1 PyTorch概述

PyTorch是Facebook开发的开源深度学习框架，以其动态计算图（Dynamic Computation Graph）和Python优先的设计理念，成为当前最流行的深度学习框架之一。

3.1.1 安装与导入

# 安装PyTorch（CPU版本）
# pip install torch

# 安装PyTorch（CUDA版本，根据CUDA版本选择）
# pip install torch --index-url https://download.pytorch.org/whl/cu118

# 导入PyTorch
import torch

3.1.2 Tensor概述

Tensor（张量）是PyTorch的核心数据结构，可以理解为多维数组，与NumPy的ndarray非常相似，但Tensor支持GPU加速计算和自动求导。

# 标量（0维张量）
scalar = torch.tensor(3.14)
print(scalar)        # tensor(3.1400)
print(scalar.dim())  # 0

# 向量（1维张量）
vector = torch.tensor([1, 2, 3, 4, 5])
print(vector)        # tensor([1, 2, 3, 4, 5])
print(vector.shape)  # torch.Size([5])

# 矩阵（2维张量）
matrix = torch.tensor([[1, 2, 3],
                      [4, 5, 6]])
print(matrix)
# tensor([[1, 2, 3],
#         [4, 5, 6]])
print(matrix.shape)  # torch.Size([2, 3])

# 3维张量
tensor_3d = torch.randn(2, 3, 4)  # 随机初始化的3维张量
print(tensor_3d.shape)  # torch.Size([2, 3, 4])

3.2 Tensor创建

PyTorch提供了多种创建Tensor的方式。

3.2.1 从数据创建

# 从Python列表创建
a = torch.tensor([1, 2, 3, 4, 5])
print(a)  # tensor([1, 2, 3, 4, 5])

# 从NumPy数组创建
import numpy as np
np_array = np.array([[1, 2, 3], [4, 5, 6]])
torch_tensor = torch.from_numpy(np_array)
print(torch_tensor)
# tensor([[1, 2, 3],
#         [4, 5, 6]])

# Tensor转NumPy
back_to_np = torch_tensor.numpy()
print(back_to_np)
# [[1 2 3]
#  [4 5 6]]

# 注意：共享内存（修改一方会影响另一方）
torch_tensor[0, 0] = 99
print(np_array[0, 0])  # 99

# 不共享内存的转换
copy_tensor = torch_tensor.clone().numpy()

3.2.2 预定义Tensor

# 全0Tensor
zeros_1d = torch.zeros(5)
zeros_2d = torch.zeros(3, 4)
zeros_3d = torch.zeros(2, 3, 4)
print(zeros_2d)
# tensor([[0., 0., 0., 0.],
#         [0., 0., 0., 0.],
#         [0., 0., 0., 0.]])

# 全1Tensor
ones = torch.ones(2, 3)
print(ones)
# tensor([[1., 1., 1.],
#         [1., 1., 1.]])

# 填充指定值
full = torch.full((2, 3), 7.0)
print(full)
# tensor([[7., 7., 7.],
#         [7., 7., 7.]])

# 单位矩阵
eye = torch.eye(4)
print(eye)
# tensor([[1., 0., 0., 0.],
#         [0., 1., 0., 0.],
#         [0., 0., 1., 0.],
#         [0., 0., 0., 1.]])

3.2.3 范围和序列

# arange：类似Python的range
a = torch.arange(10)          # 0到9
print(a)  # tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

b = torch.arange(1, 10)       # 1到9
print(b)  # tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])

c = torch.arange(0, 10, 2)    # 0到9，步长2
print(c)  # tensor([0, 2, 4, 6, 8])

# linspace：等差数列
d = torch.linspace(0, 1, 5)   # 0到1之间均匀取5个点
print(d)  # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])

3.2.4 随机Tensor

# 设置随机种子（结果可复现）
torch.manual_seed(42)

# 均匀分布[0, 1)
uniform = torch.rand(3, 4)
print(uniform)
# tensor([[0.8823, 0.9150, 0.3829, 0.9593],
#         [0.3904, 0.6009, 0.2566, 0.7936],
#         [0.9408, 0.1332, 0.9346, 0.5936]])
# （具体数值可能因PyTorch版本或平台不同而有差异）

# 标准正态分布（均值0，方差1）
normal = torch.randn(3, 4)
print(normal)

# 整数随机Tensor
integers = torch.randint(0, 10, (3, 4))  # 0到9
print(integers)

# 随机打乱
arr = torch.arange(10)
shuffled = arr[torch.randperm(len(arr))]  # 随机排列索引
print(shuffled)

# 生成随机初始化的Tensor，指定尺寸
linear = torch.randn(512, 1024)  # 权重初始化常用

3.3 Tensor属性

x = torch.randn(3, 4, 5)

# shape：形状
print(x.shape)       # torch.Size([3, 4, 5])
print(x.size())      # torch.Size([3, 4, 5])

# ndim/dim：维度数
print(x.dim())       # 3

# dtype：数据类型
print(x.dtype)       # torch.float32

# device：所在设备
print(x.device)      # cpu

# numel：元素总数
print(x.numel())     # 60

# layout：内存布局
print(x.layout)      # torch.strided

Tensor数据类型：

dtype	说明
torch.float32 / torch.float	32位浮点
torch.float64 / torch.double	64位浮点
torch.float16 / torch.half	16位浮点
torch.int8	8位整数
torch.int16 / torch.short	16位整数
torch.int32 / torch.int	32位整数
torch.int64 / torch.long	64位整数
torch.bool	布尔

# 指定数据类型
a = torch.tensor([1, 2, 3], dtype=torch.float32)
print(a.dtype)  # torch.float32

# 类型转换
b = a.to(torch.int32)
print(b.dtype)  # torch.int32

c = a.int()     # 简写形式
print(c.dtype)  # torch.int32

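# 与NumPy的类型对应
# 注意：torch.dtype 与 NumPy 的类型是不同的对象，直接用 == 比较的结果是 False；
# 二者的对应关系体现在转换时：torch.from_numpy() 会保留数组的dtype（如 np.float32 -> torch.float32）
print(torch.float32 == np.float32)  # False
print(torch.int64 == np.int64)       # False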

3.4 Tensor操作

3.4.1 索引与切片

PyTorch的索引和切片语法与NumPy非常相似。

x = torch.arange(12).reshape(3, 4)
print(x)
# tensor([[ 0,  1,  2,  3],
#         [ 4,  5,  6,  7],
#         [ 8,  9, 10, 11]])

# 基本索引
print(x[0])       # tensor([0, 1, 2, 3])
print(x[1, 2])    # tensor(6)
print(x[-1, -1])  # tensor(11)

# 切片
print(x[:, 0])    # tensor([0, 4, 8])（第一列）
print(x[1:, :2])  # tensor([[4, 5], [8, 9]])（后两行的前两列）
print(x[::2, ::2])  # tensor([[0, 2], [8, 10]])（跳行跳列）

# 布尔索引
mask = x > 5
print(mask)
# tensor([[False, False, False, False],
#         [False, False,  True,  True],
#         [ True,  True,  True,  True]])
print(x[mask])  # tensor([ 6,  7,  8,  9, 10, 11])

# 条件索引
print(x[x > 5])  # tensor([ 6,  7,  8,  9, 10, 11])

3.4.2 形状操作

# reshape：改变形状
x = torch.arange(12)
y = x.reshape(3, 4)
print(y.shape)  # torch.Size([3, 4])

# view：类似reshape，但始终与原Tensor共享数据、不复制（要求内存布局兼容，例如Tensor是连续的）
z = x.view(3, 4)

# flatten：展平为一维
flat = y.flatten()
print(flat.shape)  # torch.Size([12])

# squeeze/unsqueeze：移除/添加单维度
a = torch.randn(1, 3, 1, 4)
print(a.shape)  # torch.Size([1, 3, 1, 4])

b = torch.squeeze(a)  # 移除所有单维度
print(b.shape)  # torch.Size([3, 4])

c = torch.squeeze(a, dim=0)  # 移除指定维度的单维度
print(c.shape)  # torch.Size([3, 1, 4])

d = torch.unsqueeze(b, dim=0)  # 添加单维度
print(d.shape)  # torch.Size([1, 3, 4])

# transpose：转置（2维）
x = torch.randn(3, 4)
y = x.T  # 或 x.transpose(0, 1)
print(y.shape)  # torch.Size([4, 3])

# permute：多维转置
x = torch.randn(2, 3, 4, 5)
y = x.permute(2, 0, 3, 1)  # (2,3,4,5) -> (4,2,5,3)
print(y.shape)  # torch.Size([4, 2, 5, 3])

# repeat：重复Tensor
x = torch.tensor([1, 2, 3])
y = x.repeat(3)  # 重复3次
print(y)  # tensor([1, 2, 3, 1, 2, 3, 1, 2, 3])

z = x.repeat(2, 1)  # 沿第0维重复2次，第1维重复1次
print(z)
# tensor([[1, 2, 3],
#         [1, 2, 3]])

3.4.3 连接与分割

# cat：沿现有轴连接
a = torch.randn(2, 3)
b = torch.randn(2, 3)

c = torch.cat([a, b], dim=0)  # 沿第0维连接
print(c.shape)  # torch.Size([4, 3])

d = torch.cat([a, b], dim=1)  # 沿第1维连接
print(d.shape)  # torch.Size([2, 6])

# stack：沿新轴连接（会创建新维度）
e = torch.stack([a, b], dim=0)
print(e.shape)  # torch.Size([2, 2, 3])

# split：分割
x = torch.randn(8, 4)
# 按大小分割
splits = torch.split(x, [2, 3, 3], dim=0)  # 分割为2,3,3

# chunk：均匀分割
chunks = torch.chunk(x, 4, dim=0)  # 沿第0维均匀分割为4份

# split与chunk对比
print([s.shape for s in splits])  # [torch.Size([2, 4]), torch.Size([3, 4]), torch.Size([3, 4])]
print([c.shape for c in chunks])    # [torch.Size([2, 4]), torch.Size([2, 4]), torch.Size([2, 4]), torch.Size([2, 4])]

3.5 Tensor运算

3.5.1 逐元素运算

a = torch.tensor([1.0, 2.0, 3.0, 4.0])

# 基本算术运算
print(a + 1)     # tensor([2., 3., 4., 5.])
print(a - 1)     # tensor([0., 1., 2., 3.])
print(a * 2)     # tensor([2., 4., 6., 8.])
print(a / 2)     # tensor([0.5000, 1.0000, 1.5000, 2.0000])
print(a ** 2)    # tensor([ 1.,  4.,  9., 16.])

# 逐元素比较
b = torch.tensor([4, 2, 2, 5])
print(a > b)     # tensor([False, False,  True, False])
print(a == b)    # tensor([False,  True, False, False])

# 常用数学函数
x = torch.tensor([0, 30, 60, 90]) * np.pi / 180
print(torch.sin(x))  # 正弦
print(torch.cos(x))  # 余弦
print(torch.exp(x))  # 指数
print(torch.log(x))  # 自然对数
print(torch.sqrt(x)) # 平方根

3.5.2 矩阵运算

# 矩阵乘法
A = torch.randn(3, 4)
B = torch.randn(4, 5)
C = torch.mm(A, B)  # 或 A @ B
print(C.shape)  # torch.Size([3, 5])

# 批量矩阵乘法
A = torch.randn(10, 3, 4)
B = torch.randn(10, 4, 5)
C = torch.bmm(A, B)  # 批量矩阵乘法
print(C.shape)  # torch.Size([10, 3, 5])

# 更通用的批矩阵乘法
C = torch.matmul(A, B)  # 支持广播

# 向量点积
v1 = torch.tensor([1.0, 2.0, 3.0])
v2 = torch.tensor([4.0, 5.0, 6.0])
dot = torch.dot(v1, v2)
print(dot)  # tensor(32.)

# 求范数
x = torch.tensor([3.0, 4.0])
l2_norm = torch.norm(x)  # 默认L2范数
print(l2_norm)  # tensor(5.)

l1_norm = torch.norm(x, p=1)  # L1范数
print(l1_norm)  # tensor(7.)

# 矩阵转置
A = torch.randn(3, 4)
print(A.T.shape)  # torch.Size([4, 3])

3.5.3 归约操作

# A 2x3 integer (int64) tensor shared by the reduction examples below.
a = torch.tensor([[1, 2, 3],
                  [4, 5, 6]])

# Sum
print(torch.sum(a))           # tensor(21)
print(torch.sum(a, dim=0))    # tensor([5, 7, 9]) (column sums)
print(torch.sum(a, dim=1))    # tensor([ 6, 15]) (row sums)

# Mean: torch.mean only accepts floating-point/complex tensors, so the
# integer tensor is converted first (torch.mean(a) raises RuntimeError).
print(torch.mean(a.float()))          # tensor(3.5000)
print(torch.mean(a.float(), dim=0))   # tensor([2.5000, 3.5000, 4.5000])

# Standard deviation: also requires a floating-point tensor. By default
# torch.std divides by N-1 (Bessel's correction), unlike NumPy's np.std.
print(torch.std(a.float()))           # tensor(1.8708)

# Minimum / maximum
print(torch.min(a))           # tensor(1)
print(torch.max(a))           # tensor(6)
print(torch.argmin(a))        # tensor(0) (index into the flattened tensor)
print(torch.argmax(a))        # tensor(5)

# Max along a dimension
print(torch.max(a, dim=0))     # returns (values, indices)
values, indices = torch.max(a, dim=1)
print(values)   # tensor([3, 6])
print(indices)  # tensor([2, 2])

# Cumulative operations
print(torch.cumsum(a, dim=0))
# tensor([[1, 2, 3],
#         [5, 7, 9]])

3.5.4 广播机制

PyTorch的广播机制与NumPy相同：从最后一个维度开始向前逐一对齐，每一对维度必须相等、其中之一为1，或其中一方缺失，才能进行广播。

# 基本广播
a = torch.randn(3, 4)
b = torch.randn(4)
c = a + b  # b被广播到(3,4)

# 显式添加维度
a = torch.randn(3, 4)
b = torch.randn(3, 1)  # (3,1)广播到(3,4)
c = a + b

# 批量处理示例
images = torch.randn(32, 3, 224, 224)  # batch_size=32, 3通道, 224x224
mean = torch.mean(images, dim=[2, 3])  # (32, 3)
# mean需要reshape为(32, 3, 1, 1)才能与images广播
mean = mean.reshape(32, 3, 1, 1)
normalized = images - mean
print(normalized.shape)  # torch.Size([32, 3, 224, 224])

3.6 自动求导（autograd）

自动求导（Automatic Differentiation）是PyTorch的核心特性之一，能够自动计算梯度。

3.6.1 requires_grad

默认情况下，Tensor不会追踪计算历史。设置 requires_grad=True 开启梯度追踪。

# 创建需要求导的Tensor
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
print(x.requires_grad)  # True

# 简写方式
x = torch.ones(3, requires_grad=True)

# 运算结果的requires_grad由输入决定：只要任一输入需要求导，结果就需要求导
y = torch.tensor([1.0, 2.0, 3.0])
z = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
w = y + z  # w.requires_grad 为 True

3.6.2 计算梯度

# 示例：计算 y = x^2 在 x = [1, 2, 3] 处的梯度
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2

# y = [1, 4, 9]
# dy/dx = 2x = [2, 4, 6]

# backward()自动计算梯度
y.sum().backward()  # backward()默认只能对标量调用，因此先用sum()把y归约为标量

print(x.grad)  # tensor([2., 4., 6.])

非标量输出的反向传播：

# 非标量Tensor需要指定gradient参数或先求和
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x ** 2  # y = [1, 4, 9]

# 方式1：求和后反向传播
y.sum().backward()

# 方式2：传入与y形状相同的gradient参数
# y.backward(torch.ones_like(y))  # 效果等同于y.sum().backward()

print(x.grad)  # tensor([2., 4., 6.])

3.6.3 计算图

PyTorch使用动态计算图（Dynamic Computation Graph）：计算图在前向计算过程中随运算的执行动态构建；调用backward()后默认会被释放，下一次前向计算时重新构建。

# 计算图示例
x = torch.tensor([1.0], requires_grad=True)
y = x + 2       # 创建加法节点
z = y ** 2      # 创建幂运算节点
w = z * 3       # 创建乘法节点

w.backward()
print(x.grad)  # dw/dx = ?

# 手动验证：w = 3*(x+2)^2
# dw/dx = 6*(x+2) = 6*3 = 18
print(x.grad)  # tensor([18.])

叶子节点与依赖节点：

x = torch.tensor([1.0], requires_grad=True)
y = x + 2
z = y ** 2

print(x.is_leaf)   # True（用户创建的）
print(y.is_leaf)   # False（由运算生成）
print(z.is_leaf)   # False

print(x.grad_fn)   # None（叶子节点没有grad_fn）
print(y.grad_fn)   # <AddBackward0>
print(z.grad_fn)   # <PowBackward0>

3.6.4 梯度累积与 detach

反向传播得到的梯度会累加到叶子节点的 .grad 属性中，因此多次反向传播时梯度会累加。

# 梯度累积示例
x = torch.tensor([1.0], requires_grad=True)

# 第一次计算
y = x ** 2
y.backward()
print(x.grad)  # tensor([2.])

# 第二次计算前需要清零
x.grad.zero_()

# 第二次计算
z = x ** 3
z.backward()
print(x.grad)  # tensor([3.])

# 如果不清零，梯度会累加：2 + 3 = 5

detach()：分离计算图：

x = torch.tensor([1.0], requires_grad=True)
y = x ** 2

# 分离y，使z脱离计算图
z = y.detach()
print(z.requires_grad)  # False

# 或者创建不需要梯度的副本
y_no_grad = y.detach().requires_grad_(False)

3.6.5 with torch.no_grad()

在推理（inference）阶段，不需要梯度追踪，使用 no_grad() 可以节省内存和计算。

x = torch.tensor([1.0], requires_grad=True)
y = x ** 2

# 不追踪梯度
with torch.no_grad():
    z = y + 1
    print(z.requires_grad)  # False

# 装饰器形式
@torch.no_grad()
def inference(model, x):
    """Apply ``model`` to ``x`` with gradient tracking disabled.

    Args:
        model: Callable (for example an ``nn.Module``) to invoke.
        x: Input passed to ``model`` unchanged.

    Returns:
        Whatever ``model(x)`` returns.
    """
    prediction = model(x)
    return prediction

# 或者全局设置
torch.set_grad_enabled(False)
# ... 推理代码 ...
torch.set_grad_enabled(True)

3.7 神经网络构建

PyTorch通过 torch.nn 模块提供了构建神经网络的工具。

3.7.1 常用层

import torch.nn as nn

# Linear（全连接层）
linear = nn.Linear(in_features=10, out_features=5, bias=True)
input_tensor = torch.randn(3, 10)  # batch_size=3, 输入特征=10
output = linear(input_tensor)
print(output.shape)  # torch.Size([3, 5])

# 权重和偏置
print(linear.weight.shape)  # torch.Size([5, 10])
print(linear.bias.shape)    # torch.Size([5])

# Conv2d（卷积层）
conv = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
# 输入：(batch, channels, height, width)
input_tensor = torch.randn(1, 3, 32, 32)
output = conv(input_tensor)
print(output.shape)  # torch.Size([1, 16, 32, 32])

# MaxPool2d（最大池化）
pool = nn.MaxPool2d(kernel_size=2, stride=2)
output = pool(output)
print(output.shape)  # torch.Size([1, 16, 16, 16])

# AvgPool2d（平均池化）
avg_pool = nn.AvgPool2d(kernel_size=2, stride=2)
output = avg_pool(output)
print(output.shape)  # torch.Size([1, 16, 8, 8])

3.7.2 激活函数

import torch.nn.functional as F

x = torch.randn(5)

# ReLU
relu = F.relu(x)
print(relu)  # max(0, x)

# Sigmoid
sigmoid = torch.sigmoid(x)
print(sigmoid)  # 1 / (1 + exp(-x))

# Tanh
tanh = torch.tanh(x)
print(tanh)

# Softmax（沿指定维度）
x = torch.randn(2, 5)
softmax = F.softmax(x, dim=1)  # 沿第1维归一化
print(softmax)  # 每行和为1

3.7.3 构建模型

PyTorch有两种构建模型的方式：Sequential和类继承。

方式1：Sequential

import torch.nn as nn

# 简单模型用Sequential
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 10)
)

# 测试模型
x = torch.randn(32, 784)  # batch_size=32
output = model(x)
print(output.shape)  # torch.Size([32, 10])

方式2：类继承

import torch.nn as nn

class SimpleNet(nn.Module):
    """Fully connected network with two ReLU hidden layers.

    With the default arguments the layer sizes are 784 -> 256 -> 128 -> 10.
    No activation is applied to the output of the final layer.

    Args:
        in_features: Size of each input sample.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output values per sample.
    """

    def __init__(self, in_features: int = 784, hidden1: int = 256,
                 hidden2: int = 128, num_classes: int = 10):
        super().__init__()
        # Attribute names and creation order are kept stable so that
        # print(model) and state_dict keys do not change.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for ``x`` of shape (..., in_features)."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

model = SimpleNet()
print(model)
# SimpleNet(
#   (fc1): Linear(in_features=784, out_features=256, bias=True)
#   (fc2): Linear(in_features=256, out_features=128, bias=True)
#   (fc3): Linear(in_features=128, out_features=10, bias=True)
#   (relu): ReLU()
# )

# 访问参数
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

3.7.4 损失函数

import torch.nn as nn

# 交叉熵损失（用于分类）
criterion = nn.CrossEntropyLoss()

# 模拟预测和目标
outputs = torch.randn(4, 10)  # 4个样本，10个类别
targets = torch.tensor([0, 5, 3, 7])  # 真实类别索引

loss = criterion(outputs, targets)
print(f"CrossEntropyLoss: {loss.item():.4f}")

# MSE损失（用于回归）
mse_loss = nn.MSELoss()
predictions = torch.tensor([[1.0], [2.0], [3.0]])
targets = torch.tensor([[1.5], [2.5], [2.5]])
loss = mse_loss(predictions, targets)
print(f"MSELoss: {loss.item():.4f}")

# BCE损失（二分类）
bce_loss = nn.BCELoss()
sigmoid = nn.Sigmoid()
pred = sigmoid(torch.randn(4, 1))
target = torch.tensor([[1.0], [0.0], [1.0], [0.0]])
loss = bce_loss(pred, target)
print(f"BCELoss: {loss.item():.4f}")

3.7.5 优化器

import torch.optim as optim

# 创建模型
model = SimpleNet()

# SGD优化器
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Adam优化器（更常用）
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 典型训练步骤（示意代码：dataloader 的创建见3.8.1节，criterion 的定义见3.7.4节）
for data, target in dataloader:
    optimizer.zero_grad()    # 清零梯度
    output = model(data)    # 前向传播
    loss = criterion(output, target)  # 计算损失
    loss.backward()         # 反向传播
    optimizer.step()        # 更新参数

3.8 数据加载

3.8.1 Dataset与DataLoader

from torch.utils.data import Dataset, DataLoader

# 自定义数据集
class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample with its label by position.

    Args:
        data: Sized, indexable collection of samples.
        labels: Sized, indexable collection of labels, one per sample.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length.
    """

    def __init__(self, data, labels):
        # Fail fast: a mismatch would otherwise surface as an IndexError
        # part-way through iteration, or silently ignore trailing labels.
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        return self.data[idx], self.labels[idx]

# 创建数据集
X = torch.randn(1000, 784)  # 1000个样本
y = torch.randint(0, 10, (1000,))  # 10个类别
dataset = CustomDataset(X, y)

# 创建数据加载器
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

# 迭代数据
for batch_data, batch_labels in dataloader:
    print(batch_data.shape, batch_labels.shape)
    # torch.Size([32, 784]) torch.Size([32])
    break

3.8.2 torchvision数据集

PyTorch提供了常用的计算机视觉数据集。

import torchvision
import torchvision.transforms as transforms

# 下载并加载MNIST数据集
transform = transforms.Compose([
    transforms.ToTensor(),  # 转换为Tensor
    transforms.Normalize((0.5,), (0.5,))  # 归一化
])

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

3.9 训练流程

完整的PyTorch训练流程如下：

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 1. 准备数据
X = torch.randn(1000, 20)
y = torch.randn(1000, 1)
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 2. 定义模型
class RegressionNet(nn.Module):
    """Small MLP: 20 inputs -> 64 -> 32 -> 1 output, ReLU between layers."""

    def __init__(self):
        super().__init__()
        # Build the stack in order first, then wrap it; indices 0..4 in the
        # resulting Sequential match the order of this list.
        stack = [
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the sequential stack and return the result."""
        out = self.layers(x)
        return out

model = RegressionNet()
print(model)

# 3. 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# 4. 训练循环
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # 设置为训练模式
    running_loss = 0.0

    for batch_X, batch_y in dataloader:
        # 前向传播
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)

        # 反向传播
        optimizer.zero_grad()
        loss.backward()

        # 更新参数
        optimizer.step()

        running_loss += loss.item()

    # 打印epoch平均损失
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# 5. 评估模型
model.eval()  # 设置为评估模式
with torch.no_grad():
    test_X = torch.randn(100, 20)
    test_y = torch.randn(100, 1)
    predictions = model(test_X)
    test_loss = criterion(predictions, test_y)
    print(f"Test Loss: {test_loss.item():.4f}")

3.10 模型保存与加载

3.10.1 保存模型

# 保存整个模型（不推荐：基于pickle序列化，加载时依赖保存时的类定义和代码路径）
torch.save(model, 'model.pth')

# 保存模型参数（推荐，轻量且灵活）
torch.save(model.state_dict(), 'model_weights.pth')

# 保存检查点（保存训练状态）
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')

3.10.2 加载模型

# Load an entire pickled model. Since PyTorch 2.6, torch.load defaults to
# weights_only=True, which refuses to unpickle arbitrary classes, so a
# fully pickled nn.Module must be loaded with weights_only=False.
# SECURITY: this executes arbitrary pickle code -- only load trusted files.
loaded_model = torch.load('model.pth', weights_only=False)

# Load parameters only: create the model structure first, then restore
# the saved weights into it.
model = RegressionNet()
model.load_state_dict(torch.load('model_weights.pth'))

# Restore a training checkpoint: model and optimizer state plus the
# epoch and loss recorded when the checkpoint was saved.
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

3.10.3 GPU训练

# 检查GPU是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 将模型和数据移动到GPU
model = model.to(device)

# 训练循环中
for batch_X, batch_y in dataloader:
    batch_X = batch_X.to(device)
    batch_y = batch_y.to(device)

    predictions = model(batch_X)
    loss = criterion(predictions, batch_y)
    # ...

# 从GPU移回CPU
model = model.cpu()
predictions = predictions.cpu()

本章小结

本章介绍了PyTorch的基础知识，主要包括：

Tensor创建：从数据创建、预定义Tensor、范围数组、随机Tensor，以及与NumPy的相互转换。
Tensor属性：shape、dtype、device、numel等属性，以及数据类型转换。
Tensor操作：索引与切片、形状操作（reshape、view、transpose、permute等）、连接与分割。
Tensor运算：逐元素运算、矩阵运算、归约操作、广播机制。
自动求导（autograd）：requires_grad、backward()、计算图、梯度累积、no_grad上下文管理器。
神经网络构建：常用层（Linear、Conv2d、Pool）、激活函数、模型定义方式（Sequential和类继承）、损失函数、优化器。
数据加载：Dataset、DataLoader、torchvision数据集。
训练流程：完整的前向传播、反向传播、参数更新循环。
模型保存与加载：state_dict方式、完整模型、检查点保存、GPU/CPU迁移。

PyTorch的动态计算图和Python化API使其成为深度学习研究和实践的首选框架。本章内容为后续构建更复杂的神经网络模型奠定了基础。

思考与练习

创建两个形状为 (3, 4) 的随机Tensor A和B，实现：
矩阵乘法 A @ B^T
沿第0维计算逐元素（element-wise）乘积（使用广播）
计算每行的L2范数
实现一个简单的多层感知机（MLP），包含两个隐藏层（每层128个神经元），用于对MNIST手写数字进行分类（10类）。使用ReLU激活函数，输出层使用LogSoftmax。
给定一个自定义数据集类 ImageDataset，包含图像路径和标签，编写数据增强的transforms pipeline，包括：随机水平翻转、随机旋转（-15到15度）、归一化到[-1,1]。
使用PyTorch实现一个学习率调度器（Learning Rate Scheduler），在训练过程中根据epoch数动态调整学习率：
前5个epoch使用固定学习率0.01
之后每个epoch将学习率衰减为原来的一半
实现一个自定义损失函数 FocalLoss，用于处理类别不平衡的分类问题。Focal Loss的公式为：
FL(p_t) = -α_t(1 - p_t)^γ log(p_t)
其中 p_t 是模型预测的正确类别的概率，α 和 γ 是超参数。