Module 4: Privacy Attack Labs
Lab 4.3: Differential Privacy
Applying differential privacy to protect model privacy
Lab Objective
This lab will help you understand and apply differential privacy techniques to protect the privacy of AI models.
Lab Contents
Lab 4.3: Differential Privacy Comparison
Objectives
- Understand the basic principles of differentially private training
- Compare a normally trained model with a differentially private model
- Observe how the noise level, and hence the privacy budget (ε), affects model performance and privacy protection
Environment
- Python 3.8+
- PyTorch
- numpy, matplotlib
Estimated time: 30 minutes
---
Core Concept Review
Differential privacy protects the privacy of individual training samples by adding noise during training. The two core operations are gradient clipping and noise addition.
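For reference: a randomized mechanism M is (ε, δ)-differentially private if, for any two datasets D and D' that differ in a single sample and any set of outputs S,

Pr[M(D) ∈ S] ≤ e^ε · Pr[M(D') ∈ S] + δ

A smaller privacy budget ε means stronger privacy. For a single application of the Gaussian mechanism with sensitivity C (the role played by the clipping norm in DP-SGD), noise with standard deviation σ = C · √(2 ln(1.25/δ)) / ε suffices for (ε, δ)-DP when ε < 1; the noise_multiplier used in this lab corresponds to σ / C. Computing a tight ε across many training steps requires a privacy accountant, which is beyond the scope of this lab, so below we work directly with the noise multiplier.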
Part 1: Environment Setup
In [ ]:
# Import the required libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
# Optional: font settings for non-ASCII plot labels
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
print("Environment ready!")
In [ ]:
# Create a synthetic classification dataset
def create_dataset(n_samples=1000, n_features=20, n_classes=5):
    X = np.random.randn(n_samples, n_features).astype(np.float32)
    centers = np.random.randn(n_classes, n_features) * 2
    y = np.random.randint(0, n_classes, n_samples)
    for i in range(n_samples):
        X[i] += centers[y[i]]  # shift each sample toward its class center
    return torch.tensor(X), torch.tensor(y)
# Create and split the dataset
X, y = create_dataset(n_samples=1000)
train_size = 500
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]
print(f"训练集: {len(X_train)} 样本")
print(f"测试集: {len(X_test)} 样本")In [ ]:
# Define a simple classifier
class SimpleClassifier(nn.Module):
    def __init__(self, n_features=20, n_classes=5, hidden_size=64):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(n_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_classes)
        )
    def forward(self, x):
        return self.network(x)
print("Model defined!")
Part 2: Implementing Simplified Differentially Private Training
The core operations of differentially private SGD (DP-SGD) are:
1. Gradient clipping: bound the gradient norm of each individual sample
2. Noise addition: add Gaussian noise to the aggregated gradient
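Before filling in the blanks below, here is a minimal stand-alone sketch of these two operations applied to a single gradient tensor (the values C = 1.0 and sigma = 1.0 are illustrative, not prescribed by the exercise):

import torch

g = torch.randn(10) * 3                                   # pretend this is one sample's gradient
C, sigma = 1.0, 1.0                                       # clipping norm and noise multiplier (illustrative)
g_clipped = g * min(1.0, C / g.norm().item())             # 1. clip: rescale so the L2 norm is at most C
g_private = g_clipped + torch.randn_like(g) * sigma * C   # 2. noise: Gaussian noise with std sigma * C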
In [ ]:
# [Fill-in 1] Implement the gradient clipping function
# Hint: if the gradient norm exceeds the threshold, scale the gradient down proportionally
def clip_gradient(gradient, max_norm):
    """
    Clip a gradient so that its norm does not exceed max_norm
    Args:
        gradient: gradient tensor
        max_norm: maximum norm threshold
    Returns:
        the clipped gradient
    """
    # Compute the L2 norm of the gradient
    grad_norm = torch.norm(gradient)
    # [Fill-in 1] Clip the gradient if its norm exceeds the threshold
    # Hint: clipped = gradient * (max_norm / grad_norm) if grad_norm > max_norm
    # Reference answer:
    # if grad_norm > max_norm:
    #     gradient = gradient * (max_norm / grad_norm)
    if grad_norm > max_norm:
        gradient = ___________________
    return gradient

# Quick test
test_grad = torch.randn(10) * 5
print(f"Norm before clipping: {torch.norm(test_grad):.2f}")
clipped = clip_gradient(test_grad.clone(), max_norm=1.0)
print(f"Norm after clipping: {torch.norm(clipped):.2f}")
In [ ]:
# [Fill-in 2] Implement the differentially private training function
# Hint: add Gaussian noise to the gradient of each update
def train_with_dp(model, X_train, y_train, epochs=50, lr=0.01,
                  max_grad_norm=1.0, noise_multiplier=1.0):
    """
    Train a model with differential privacy
    Args:
        max_grad_norm: gradient clipping threshold
        noise_multiplier: noise strength (relative to the clipping threshold)
    """
    optimizer = optim.SGD(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    losses = []
    n_samples = len(X_train)
    for epoch in range(epochs):
        model.train()
        # Compute gradients for each sample individually (simplified; real implementations use batching)
        accumulated_grads = {name: torch.zeros_like(param)
                             for name, param in model.named_parameters()}
        for i in range(n_samples):
            optimizer.zero_grad()
            output = model(X_train[i:i+1])
            loss = criterion(output, y_train[i:i+1])
            loss.backward()
            # Clip each sample's gradients and accumulate them
            for name, param in model.named_parameters():
                if param.grad is not None:
                    clipped_grad = clip_gradient(param.grad.clone(), max_grad_norm)
                    accumulated_grads[name] += clipped_grad
        # Average the gradients and add noise
        for name, param in model.named_parameters():
            avg_grad = accumulated_grads[name] / n_samples
            # [Fill-in 2] Add Gaussian noise
            # Noise standard deviation = max_grad_norm * noise_multiplier / n_samples
            # Reference answer: noise = torch.randn_like(avg_grad) * (max_grad_norm * noise_multiplier / n_samples)
            noise_std = max_grad_norm * noise_multiplier / n_samples
            noise = ___________________
            # Apply the noisy gradient
            param.data -= lr * (avg_grad + noise)
        # Record the loss
        with torch.no_grad():
            outputs = model(X_train)
            epoch_loss = criterion(outputs, y_train).item()
            losses.append(epoch_loss)
            if (epoch + 1) % 10 == 0:
                acc = (outputs.argmax(dim=1) == y_train).float().mean()
                print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Acc: {acc:.2%}")
    return losses
print("DP training function defined!")
In [ ]:
# Standard training function (for comparison)
def train_normal(model, X_train, y_train, epochs=50, lr=0.01):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    losses = []
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if (epoch + 1) % 10 == 0:
            acc = (outputs.argmax(dim=1) == y_train).float().mean()
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Acc: {acc:.2%}")
    return losses
Part 3: Comparing Standard Training and Differentially Private Training
In [ ]:
# Train the baseline model
print("=" * 50)
print("Training the baseline model (no privacy protection)")
print("=" * 50)
normal_model = SimpleClassifier()
normal_losses = train_normal(normal_model, X_train, y_train, epochs=50)
In [ ]:
# Train the differentially private model
print("\n" + "=" * 50)
print("Training the DP model (noise multiplier = 1.0)")
print("=" * 50)
dp_model = SimpleClassifier()
dp_losses = train_with_dp(dp_model, X_train, y_train, epochs=50,
                          noise_multiplier=1.0, max_grad_norm=1.0)
In [ ]:
# [Fill-in 3] Evaluate the performance of both models
# Hint: compute accuracy on both the training set and the test set
def evaluate_model(model, X, y):
    model.eval()
    with torch.no_grad():
        outputs = model(X)
        # [Fill-in 3] Compute the accuracy
        # Reference answer: acc = (outputs.argmax(dim=1) == y).float().mean().item()
        acc = ___________________
    return acc

print("\nModel performance comparison:")
print("-" * 50)
normal_train_acc = evaluate_model(normal_model, X_train, y_train)
normal_test_acc = evaluate_model(normal_model, X_test, y_test)
print(f"Baseline model - train accuracy: {normal_train_acc:.2%}, test accuracy: {normal_test_acc:.2%}")
dp_train_acc = evaluate_model(dp_model, X_train, y_train)
dp_test_acc = evaluate_model(dp_model, X_test, y_test)
print(f"DP model - train accuracy: {dp_train_acc:.2%}, test accuracy: {dp_test_acc:.2%}")
print(f"\nAccuracy drop: {(normal_test_acc - dp_test_acc):.2%}")
Part 4: How Differential Privacy Defends Against Membership Inference
In [ ]:
# Membership inference attack
def membership_inference(model, X_train, y_train, X_test, y_test):
    """
    Run a confidence-thresholding membership inference attack and return the attack accuracy
    """
    model.eval()
    with torch.no_grad():
        # Get the model's confidence in the true label
        train_probs = torch.softmax(model(X_train), dim=1)
        test_probs = torch.softmax(model(X_test), dim=1)
        train_conf = train_probs.gather(1, y_train.unsqueeze(1)).squeeze().numpy()
        test_conf = test_probs.gather(1, y_test.unsqueeze(1)).squeeze().numpy()
    # Use the midpoint of the two medians as the threshold
    threshold = (np.median(train_conf) + np.median(test_conf)) / 2
    # Compute the attack accuracy
    member_correct = np.mean(train_conf > threshold)
    non_member_correct = np.mean(test_conf <= threshold)
    attack_acc = (member_correct + non_member_correct) / 2
    return attack_acc, train_conf, test_conf

# Attack both models
normal_attack_acc, normal_train_conf, normal_test_conf = membership_inference(
    normal_model, X_train, y_train, X_test, y_test)
dp_attack_acc, dp_train_conf, dp_test_conf = membership_inference(
    dp_model, X_train, y_train, X_test, y_test)
print("Membership inference attack results:")
print("-" * 50)
print(f"Baseline model - attack accuracy: {normal_attack_acc:.2%}")
print(f"DP model - attack accuracy: {dp_attack_acc:.2%}")
print(f"\nDefense effect: attack accuracy dropped by {(normal_attack_acc - dp_attack_acc):.2%}")
if dp_attack_acc < 0.55:
    print("✓ Differential privacy effectively reduced the success rate of the membership inference attack!")
In [ ]:
# Visualize and compare the confidence distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Baseline model
axes[0].hist(normal_train_conf, bins=25, alpha=0.6, label='Members', color='coral')
axes[0].hist(normal_test_conf, bins=25, alpha=0.6, label='Non-members', color='steelblue')
axes[0].set_xlabel('Confidence')
axes[0].set_ylabel('Number of samples')
axes[0].set_title(f'Baseline model\nAttack accuracy: {normal_attack_acc:.1%}')
axes[0].legend()
# DP model
axes[1].hist(dp_train_conf, bins=25, alpha=0.6, label='Members', color='coral')
axes[1].hist(dp_test_conf, bins=25, alpha=0.6, label='Non-members', color='steelblue')
axes[1].set_xlabel('Confidence')
axes[1].set_ylabel('Number of samples')
axes[1].set_title(f'DP model\nAttack accuracy: {dp_attack_acc:.1%}')
axes[1].legend()
plt.suptitle('Member vs. non-member confidence distributions', fontsize=14)
plt.tight_layout()
plt.show()
print("Observation: for the DP model, the member and non-member confidence distributions overlap more and are harder to separate")
Part 5: The Effect of Different Noise Levels
In [ ]:
# Sweep over different noise multipliers
noise_multipliers = [0.0, 0.5, 1.0, 2.0, 4.0]
results = []
print("Testing different noise levels...\n")
for noise_mult in noise_multipliers:
    # Train a model for this noise level
    temp_model = SimpleClassifier()
    if noise_mult == 0:
        train_normal(temp_model, X_train, y_train, epochs=50)
    else:
        train_with_dp(temp_model, X_train, y_train, epochs=50,
                      noise_multiplier=noise_mult)
    # Evaluate utility and privacy
    test_acc = evaluate_model(temp_model, X_test, y_test)
    attack_acc, _, _ = membership_inference(temp_model, X_train, y_train, X_test, y_test)
    results.append({
        'noise': noise_mult,
        'test_acc': test_acc,
        'attack_acc': attack_acc
    })
    print(f"noise={noise_mult}: test accuracy={test_acc:.1%}, attack accuracy={attack_acc:.1%}")
In [ ]:
# Visualize the privacy-utility trade-off
noise_list = [r['noise'] for r in results]
test_acc_list = [r['test_acc'] for r in results]
attack_acc_list = [r['attack_acc'] for r in results]
fig, ax1 = plt.subplots(figsize=(10, 5))
color1 = 'steelblue'
ax1.set_xlabel('Noise level (noise_multiplier)')
ax1.set_ylabel('Model test accuracy', color=color1)
ax1.plot(noise_list, test_acc_list, 'o-', color=color1, linewidth=2, markersize=8, label='Test accuracy')
ax1.tick_params(axis='y', labelcolor=color1)
ax2 = ax1.twinx()
color2 = 'coral'
ax2.set_ylabel('Membership inference attack accuracy', color=color2)
ax2.plot(noise_list, attack_acc_list, 's--', color=color2, linewidth=2, markersize=8, label='Attack accuracy')
ax2.tick_params(axis='y', labelcolor=color2)
ax2.axhline(y=0.5, color='gray', linestyle=':', label='Random guessing')
plt.title('Privacy-utility trade-off: effect of the noise level')
fig.tight_layout()
plt.show()
print("\nConclusions:")
print("- More noise → stronger privacy protection (lower attack accuracy)")
print("- More noise → lower model utility (lower test accuracy)")
print("- The right balance depends on the application")
Lab Summary
Observations
Answer the following questions:
1. How does differential privacy affect model performance? By how much did accuracy drop, and is that cost acceptable?
2. How well does differential privacy defend against membership inference? By how much did the attack accuracy drop?
3. How should the noise level be chosen? How do you trade off privacy protection against model utility?
Core Concept Review
- Gradient clipping: limits the influence of any single sample
- Noise addition: hides the contribution of any single sample
- Privacy-utility trade-off: stronger privacy means lower utility
- Defense effect: DP training can effectively reduce the success rate of membership inference attacks; a sketch of how DP-SGD is used in practice follows this list
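The per-sample loop in this lab is written for clarity rather than speed. In practice, DP-SGD is usually applied through a library such as Opacus, which handles per-sample gradient computation, clipping, noising, and privacy accounting. A rough sketch of what that looks like (the API below is recalled from Opacus 1.x and should be checked against the current documentation; train_loader is assumed to be a DataLoader rather than the full tensors used above):

from opacus import PrivacyEngine

privacy_engine = PrivacyEngine()
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=1.0,
    max_grad_norm=1.0,
)
# Train as usual afterwards: clipping and noise are applied inside optimizer.step(),
# and privacy_engine.get_epsilon(delta) reports the privacy budget spent so far.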
---
Module 4 labs complete! You have now covered the basics of training data extraction, membership inference attacks, and differential privacy defenses.
Completion Checklist
After finishing this lab, you should have:
- Understood the basic principles of differential privacy
- Implemented a simplified DP-SGD training procedure
- Analyzed how the privacy budget affects model utility