实验目标

本实验将帮助你实现成员推理攻击，判断特定数据是否被用于模型训练。

实验内容

实验 4.2：成员推理攻击

实验目标

- 理解成员推理攻击的原理
- 实现基于置信度的成员推理
- 观察过拟合对攻击成功率的影响

实验环境

- Python 3.8+
- PyTorch
- numpy, matplotlib

预计时间：30 分钟

---

核心概念回顾

成员推理攻击：判断一个样本是否被用于训练模型。其依据是：模型对训练过的数据（成员）通常比对未见过的数据（非成员）表现出更高的置信度，攻击者正是利用这一差异来区分二者。

第一部分：环境准备

In [ ]:

# 导入必要的库
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, random_split

# Seed both random number generators so every run of the notebook is reproducible
np.random.seed(42)
torch.manual_seed(42)

# Configure matplotlib so the Chinese labels and the minus sign render correctly
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})

print("环境准备完成！")

In [ ]:

# Build a simple synthetic classification dataset
def create_dataset(n_samples=1000, n_features=20, n_classes=5):
    """
    Create a synthetic cluster-based classification dataset.

    Every sample is standard-normal noise shifted by the centre of a randomly
    chosen class. Values are drawn from NumPy's global RNG in a fixed order
    (noise, class centres, labels), so seeding ``np.random`` beforehand makes
    the result reproducible.

    Args:
        n_samples: number of samples to generate.
        n_features: dimensionality of each sample.
        n_classes: number of classes (labels are in ``[0, n_classes)``).

    Returns:
        ``(X, y)`` where ``X`` is a float32 tensor of shape
        ``(n_samples, n_features)`` and ``y`` is an int64 tensor of shape
        ``(n_samples,)``.
    """
    X = np.random.randn(n_samples, n_features).astype(np.float32)
    # One centre per class, scaled by 2
    centers = np.random.randn(n_classes, n_features) * 2
    y = np.random.randint(0, n_classes, n_samples)
    # Shift every sample towards its class centre in a single vectorised step
    X += centers[y]
    # Force int64 labels: randint's default integer width is platform-dependent,
    # while nn.CrossEntropyLoss expects long (int64) class indices.
    return torch.tensor(X), torch.tensor(y, dtype=torch.long)

# Generate the full dataset
X, y = create_dataset(n_samples=1000)
print(f"数据集大小: {X.shape[0]} 样本, {X.shape[1]} 特征, {len(torch.unique(y))} 类别")

# The first `train_size` samples become the training set (members);
# everything after that is held out as the test set (non-members).
train_size = 500
member_rows = slice(None, train_size)
non_member_rows = slice(train_size, None)
X_train, X_test = X[member_rows], X[non_member_rows]
y_train, y_test = y[member_rows], y[non_member_rows]

print(f"训练集（成员）: {len(X_train)} 样本")
print(f"测试集（非成员）: {len(X_test)} 样本")

In [ ]:

# A small fully connected neural-network classifier
class SimpleClassifier(nn.Module):
    """MLP with two hidden ReLU layers that maps features to raw class scores."""

    def __init__(self, n_features=20, n_classes=5, hidden_size=64):
        super().__init__()
        # Widths of consecutive layers: input -> hidden -> hidden -> output
        widths = [n_features, hidden_size, hidden_size, n_classes]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # The output layer emits raw scores, so drop the trailing activation
        layers.pop()
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw class scores."""
        return self.network(x)

print("模型定义完成！")

第二部分：训练目标模型

In [ ]:

# Training routine
def train_model(model, X_train, y_train, epochs=50, lr=0.01):
    """
    Train ``model`` on the whole training set at once.

    Each epoch performs one full-batch Adam step on the cross-entropy loss.
    Every 10th epoch the loss and the training accuracy (computed from the
    outputs of that epoch's forward pass) are printed.

    Args:
        model: the network to optimise (updated in place).
        X_train: input features passed to ``model``.
        y_train: target class indices for the cross-entropy loss.
        epochs: number of full-batch update steps.
        lr: Adam learning rate.

    Returns:
        A list with the loss value (Python float) of every epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    history = []

    for epoch in range(epochs):
        model.train()
        adam.zero_grad()
        outputs = model(X_train)
        loss = loss_fn(outputs, y_train)
        loss.backward()
        adam.step()
        history.append(loss.item())

        # Only report progress on every 10th epoch
        if (epoch + 1) % 10 != 0:
            continue
        acc = (outputs.argmax(dim=1) == y_train).float().mean()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Acc: {acc:.2%}")

    return history

# Train the target model for 100 epochs; the aim is an overfitted model
# on which the membership inference effect is easy to demonstrate.
print("训练目标模型（故意过拟合以演示攻击效果）...\n")
model = SimpleClassifier()
losses = train_model(model=model, X_train=X_train, y_train=y_train, epochs=100)

In [ ]:

# Measure the model's accuracy on the training set and on the test set
model.eval()
with torch.no_grad():
    train_hits = model(X_train).argmax(dim=1) == y_train
    test_hits = model(X_test).argmax(dim=1) == y_test
    train_acc = train_hits.float().mean()
    test_acc = test_hits.float().mean()

print(f"\n模型性能：")
print(f"  训练集准确率: {train_acc:.2%}")
print(f"  测试集准确率: {test_acc:.2%}")
print(f"  过拟合程度: {(train_acc - test_acc):.2%}")

# A train/test gap above 10 percentage points is reported as overfitting
generalization_gap = train_acc - test_acc
if generalization_gap > 0.1:
    print("\n⚠️ 模型存在过拟合，成员推理攻击更容易成功！")

第三部分：实现成员推理攻击

In [ ]:

# [Blank 1] Implement confidence-based membership inference
# Hint: compute the probability the model assigns to each sample's TRUE class
# (this is what get_confidence is documented to return), not the maximum probability.

def get_confidence(model, X, y):
    """
    Get the model's prediction confidence for every sample.

    Puts ``model`` in eval mode, runs it on ``X`` with gradients disabled and
    turns the outputs into probabilities with a softmax over dim 1.

    Returns: the confidence of each sample on its true class ``y``, as a
    NumPy array (once the blank below has been filled in).
    """
    model.eval()
    with torch.no_grad():
        outputs = model(X)
        # Convert the raw outputs into softmax probabilities
        probs = torch.softmax(outputs, dim=1)
        
        # [Blank 1] Select each sample's probability on its true class
        # Hint: use gather or indexing to pick the probability of the matching class
        # Reference answer: confidences = probs.gather(1, y.unsqueeze(1)).squeeze(1)
        # (squeeze(1) instead of a bare squeeze() keeps the result 1-D even when
        # X holds a single sample)
        confidences = ___________________
    
    return confidences.numpy()

# Collect the true-class confidences for the training set (members) first,
# then for the test set (non-members)
train_confidences, test_confidences = (
    get_confidence(model, feats, labels)
    for feats, labels in ((X_train, y_train), (X_test, y_test))
)

print(f"训练集（成员）平均置信度: {np.mean(train_confidences):.4f}")
print(f"测试集（非成员）平均置信度: {np.mean(test_confidences):.4f}")

In [ ]:

# Visualise the confidence distributions of members and non-members
plt.figure(figsize=(10, 5))

plt.hist(train_confidences, bins=30, alpha=0.6, label='训练集（成员）', color='coral')
plt.hist(test_confidences, bins=30, alpha=0.6, label='测试集（非成员）', color='steelblue')
# Draw the candidate-threshold line BEFORE building the legend: plt.legend()
# only lists artists that already exist when it is called, so a line added
# afterwards would be missing from the legend despite having a label.
plt.axvline(x=0.5, color='gray', linestyle='--', label='可能的阈值')

plt.xlabel('置信度')
plt.ylabel('样本数量')
plt.title('成员 vs 非成员的置信度分布')
plt.legend()
plt.tight_layout()
plt.show()

print("观察：成员样本的置信度分布偏向右侧（更高）")

In [ ]:

# [Blank 2] Implement the membership inference attack function
# Hint: use a threshold; samples whose confidence is above it are judged to be members

def membership_inference_attack(model, X, y, threshold):
    """
    Run the membership inference attack.

    Obtains the per-sample confidences from ``get_confidence(model, X, y)``
    and compares them against ``threshold``.

    Returns: the predictions (True = member, False = non-member)
    """
    confidences = get_confidence(model, X, y)
    
    # [Blank 2] Decide membership based on the threshold
    # Hint: confidence > threshold -> predicted to be a member
    # Reference answer: predictions = confidences > threshold
    predictions = ___________________
    
    return predictions

# Pick the threshold halfway between the median member confidence
# and the median non-member confidence
member_median = np.median(train_confidences)
non_member_median = np.median(test_confidences)
threshold = (member_median + non_member_median) / 2
print(f"选择的攻击阈值: {threshold:.4f}")

In [ ]:

# [Blank 3] Evaluate the attack
# Hint: compute how often the attack is right on members and on non-members

# Run the attack on the training set (true members)
train_predictions = membership_inference_attack(model, X_train, y_train, threshold)
# Run the attack on the test set (true non-members)
test_predictions = membership_inference_attack(model, X_test, y_test, threshold)

# [Blank 3] Compute the attack accuracy
# For members: a prediction of True is correct
# For non-members: a prediction of False is correct
# Reference answer:
# member_acc = np.mean(train_predictions == True)
# non_member_acc = np.mean(test_predictions == False)

member_acc = ___________________
non_member_acc = np.mean(test_predictions == False)

# Average of the member and non-member rates, so both groups weigh equally
overall_acc = (member_acc + non_member_acc) / 2

print(f"\n成员推理攻击结果：")
print(f"  成员识别率（真阳率）: {member_acc:.2%}")
print(f"  非成员识别率（真阴率）: {non_member_acc:.2%}")
print(f"  总体准确率: {overall_acc:.2%}")

# 0.6 is the cut-off this lab uses for calling the attack successful
# (random guessing would score 50%)
if overall_acc > 0.6:
    print(f"\n⚠️ 攻击成功！准确率显著高于随机猜测(50%)")
else:
    print(f"\n✓ 攻击效果有限，接近随机猜测")

第四部分：阈值对攻击效果的影响

In [ ]:

# Sweep the attack threshold and record how the attack behaves at each value
thresholds = np.linspace(0.1, 0.95, 20)
results = []

for thresh in thresholds:
    flagged_members = membership_inference_attack(model, X_train, y_train, thresh)
    flagged_non_members = membership_inference_attack(model, X_test, y_test, thresh)

    tpr = np.mean(flagged_members == True)       # true positive rate
    fpr = np.mean(flagged_non_members == True)   # false positive rate
    balanced_acc = (tpr + (1 - fpr)) / 2         # balanced accuracy

    results.append((thresh, tpr, fpr, balanced_acc))

# Split the (threshold, tpr, fpr, accuracy) rows into one list per column
thresholds_list, tpr_list, fpr_list, acc_list = (list(column) for column in zip(*results))

# Plot the rates and the balanced accuracy side by side
fig, (ax_rates, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))

ax_rates.plot(thresholds_list, tpr_list, 'b-', label='真阳率（成员识别）')
ax_rates.plot(thresholds_list, fpr_list, 'r-', label='假阳率（误判非成员）')
ax_rates.set_xlabel('阈值')
ax_rates.set_ylabel('比率')
ax_rates.set_title('阈值 vs 真阳率/假阳率')
ax_rates.legend()
ax_rates.grid(True)

ax_acc.plot(thresholds_list, acc_list, 'g-', linewidth=2)
ax_acc.axhline(y=0.5, color='gray', linestyle='--', label='随机猜测')
ax_acc.set_xlabel('阈值')
ax_acc.set_ylabel('平衡准确率')
ax_acc.set_title('阈值 vs 攻击准确率')
ax_acc.legend()
ax_acc.grid(True)

fig.tight_layout()
plt.show()

# Report the threshold with the highest balanced accuracy
best_idx = np.argmax(acc_list)
print(f"最佳阈值: {thresholds_list[best_idx]:.3f}, 最高准确率: {acc_list[best_idx]:.2%}")

第五部分：过拟合程度对攻击的影响

In [ ]:

# Compare models trained for different numbers of epochs
epoch_values = [10, 50, 100, 200]
attack_results = []

print("测试不同训练程度对攻击效果的影响...\n")

for epochs in epoch_values:
    # Train a fresh model from scratch for this epoch budget
    temp_model = SimpleClassifier()
    train_model(temp_model, X_train, y_train, epochs=epochs, lr=0.01)

    # Train and test accuracy; their difference is used as the overfitting measure
    temp_model.eval()
    with torch.no_grad():
        train_acc, test_acc = (
            (temp_model(feats).argmax(dim=1) == labels).float().mean().item()
            for feats, labels in ((X_train, y_train), (X_test, y_test))
        )

    # Attack with a threshold halfway between the two median confidences
    train_conf = get_confidence(temp_model, X_train, y_train)
    test_conf = get_confidence(temp_model, X_test, y_test)
    best_thresh = (np.median(train_conf) + np.median(test_conf)) / 2

    train_pred = train_conf > best_thresh
    test_pred = test_conf > best_thresh
    # Balanced accuracy: members flagged as members, non-members left unflagged
    member_rate = np.mean(train_pred)
    non_member_rate = np.mean(~test_pred)
    attack_acc = (member_rate + non_member_rate) / 2

    summary = {
        'epochs': epochs,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'overfit': train_acc - test_acc,
        'attack_acc': attack_acc
    }
    attack_results.append(summary)
    print(f"Epochs={epochs}: 过拟合={train_acc-test_acc:.1%}, 攻击准确率={attack_acc:.1%}")

In [ ]:

# Plot the overfitting gap against the attack accuracy
overfits, attack_accs = [], []
for r in attack_results:
    overfits.append(r['overfit'])
    attack_accs.append(r['attack_acc'])

plt.figure(figsize=(8, 5))
plt.scatter(overfits, attack_accs, s=100, c='coral')

# Label every point with the number of epochs it was trained for
for r, point in zip(attack_results, zip(overfits, attack_accs)):
    plt.annotate(
        f"{r['epochs']} epochs",
        point,
        textcoords="offset points",
        xytext=(10, 5),
    )

plt.xlabel('过拟合程度（训练准确率 - 测试准确率）')
plt.ylabel('攻击准确率')
plt.title('过拟合程度 vs 成员推理攻击准确率')
plt.axhline(y=0.5, color='gray', linestyle='--', label='随机猜测')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

print("\n结论：过拟合程度越高，成员推理攻击越容易成功！")

实验总结

观察记录

请回答以下问题：

1. 成员和非成员的置信度分布有什么区别？ 为什么会有这种区别？

2. 过拟合如何影响攻击成功率？ 从模型对训练数据的"记忆"角度解释。

3. 如何防御成员推理攻击？ 基于实验观察，提出可能的防御思路。

核心概念回顾

- 成员推理：判断样本是否在训练集中
- 置信度差异：模型对训练数据更"自信"
- 过拟合影响：过拟合越严重，攻击越容易
- 防御思路：减少过拟合、添加噪声、限制输出信息

---

下一个实验：实验 4.3 差分隐私对比

实验总结

完成检查

完成本实验后，你应该已经：

- 理解成员推理攻击的原理
- 实现了基于置信度阈值的成员推理攻击
- 分析了阈值和过拟合程度对攻击成功率的影响

实验 4.2：成员推理

实验目标

实验内容

实验 4.2：成员推理攻击

实验目标

实验环境

预计时间：30 分钟

核心概念回顾

第一部分：环境准备

第二部分：训练目标模型

第三部分：实现成员推理攻击

第四部分：阈值对攻击效果的影响

第五部分：过拟合程度对攻击的影响

实验总结

观察记录

核心概念回顾

实验总结

目录导航