模型超参数调优
概览
超参数调优是系统地搜索最佳模型配置参数组合以最大化验证数据上的性能的过程。
使用时机
- 需要让模型性能超越基线配置时
- 需要系统地比较不同参数组合时
- 需要微调超参数众多的复杂模型时
- 需要在偏差、方差与训练时间之间寻求最佳权衡时
- 需要提升模型在验证集和测试集上的泛化能力时
- 需要探索神经网络、树模型或集成方法的参数空间时
调优方法
- 网格搜索:参数网格上的穷举搜索
- 随机搜索:参数空间中的随机抽样
- 贝叶斯优化:基于概率模型的搜索
- Hyperband:多保真度优化
- 进化算法:基于遗传算法的搜索
- 基于种群的训练:分布式参数优化
各模型类型的超参数
- 树模型:max_depth, min_samples_split, learning_rate
- 神经网络:learning_rate, batch_size, num_layers, dropout
- SVM:C, kernel, gamma
- 集成:n_estimators, max_features, min_samples_leaf
Python 实现
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import optuna
from optuna.samplers import TPESampler
import torch
import torch.nn as nn
from torch.optim import Adam
import time
# Build a synthetic classification dataset (2000 samples, 50 features).
X, y = make_classification(
    n_samples=2000,
    n_features=50,
    n_informative=30,
    n_redundant=10,
    random_state=42,
)

# Hold out 20% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("数据集形状:", X_train_scaled.shape, X_test_scaled.shape)
# 1. Grid search: exhaustively evaluate every combination in the grid
#    (3 * 3 * 3 * 3 = 81 candidates, each scored with 5-fold CV).
# The header string needs an explicit "\n" escape; a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\n=== 1. 网格搜索 ===")
start = time.time()
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train_scaled, y_train)
# Wall-clock duration of the whole search, reused by the comparison plot.
grid_time = time.time() - start
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳CV分数: {grid_search.best_score_:.4f}")
print(f"测试分数: {grid_search.score(X_test_scaled, y_test):.4f}")
print(f"所需时间: {grid_time:.2f}s")
# 2. Randomized search: sample 20 configurations from a wider space than the
#    grid above and score each with 5-fold CV.
# The header string needs an explicit "\n" escape; a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\n=== 2. 随机搜索 ===")
start = time.time()
param_dist = {
    'n_estimators': np.arange(50, 300, 10),
    'max_depth': np.arange(5, 30, 1),
    'min_samples_split': np.arange(2, 20, 1),
    'min_samples_leaf': np.arange(1, 10, 1),
    'max_features': ['sqrt', 'log2'],
}
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,  # fixed seed so the sampled configurations are repeatable
    verbose=0,
)
random_search.fit(X_train_scaled, y_train)
# Wall-clock duration of the whole search, reused by the comparison plot.
random_time = time.time() - start
print(f"最佳参数: {random_search.best_params_}")
print(f"最佳CV分数: {random_search.best_score_:.4f}")
print(f"测试分数: {random_search.score(X_test_scaled, y_test):.4f}")
print(f"所需时间: {random_time:.2f}s")
# 3. Bayesian optimization with Optuna
# The header string needs an explicit "\n" escape; a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\n=== 3. 贝叶斯优化(Optuna) ===")
def objective(trial):
    """Score one Optuna trial.

    Samples random-forest hyperparameters from ``trial``, then returns the
    mean 5-fold cross-validated accuracy of that forest on the scaled
    training data.
    """
    # Parameters are requested in a fixed order: n_estimators, max_depth,
    # min_samples_split, min_samples_leaf, max_features.
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )
    fold_accuracies = cross_val_score(
        forest, X_train_scaled, y_train, cv=5, scoring='accuracy'
    )
    return fold_accuracies.mean()
# Time a 20-trial study driven by a seeded TPE sampler.
start = time.time()
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=20, show_progress_bar=False)
optuna_time = time.time() - start

best_trial = study.best_trial
print(f"最佳参数: {best_trial.params}")
print(f"最佳CV分数: {best_trial.value:.4f}")

# Refit a final model on the full training split using the best parameters
# found by the study, then evaluate it on the held-out test split.
best_model = RandomForestClassifier(
    random_state=42, **best_trial.params
).fit(X_train_scaled, y_train)
print(f"测试分数: {best_model.score(X_test_scaled, y_test):.4f}")
print(f"所需时间: {optuna_time:.2f}s")
# 4. Gradient boosting hyperparameter tuning
# NOTE: this grid has 4 * 3 * 4 * 3 * 3 = 432 candidates, each scored with
# 5-fold CV (2160 fits before the final refit), so expect a long run.
# The header string needs an explicit "\n" escape; a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\n=== 4. 梯度提升调优 ===")
gb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'subsample': [0.8, 0.9, 1.0],
}
gb_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
)
gb_search.fit(X_train_scaled, y_train)
print(f"最佳参数: {gb_search.best_params_}")
print(f"最佳CV分数: {gb_search.best_score_:.4f}")
print(f"测试分数: {gb_search.score(X_test_scaled, y_test):.4f}")
# 5. Neural-network learning-rate tuning
# The header string needs an explicit "\n" escape; a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\n=== 5. 神经网络学习率调优 ===")
class SimpleNN(nn.Module):
    """Small fully connected network that outputs one sigmoid value per row.

    Layout: ``input_dim -> 128 -> 64 -> 1`` with ReLU after each hidden
    layer, dropout after each ReLU, and a sigmoid on the final output.

    Args:
        input_dim: Number of input features. Defaults to 50, the width this
            class was originally hard-coded to.
        dropout: Dropout probability applied after each hidden layer.
            Defaults to 0.3.
    """

    def __init__(self, input_dim: int = 50, dropout: float = 0.3):
        super().__init__()
        # Layers are created in the same order as before so that a seeded
        # construction consumes the RNG identically.
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` values in [0, 1]."""
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        # Sigmoid is applied here, so the matching loss is BCELoss (not the
        # logits variant).
        x = torch.sigmoid(self.fc3(x))
        return x
learning_rates = [0.0001, 0.001, 0.01, 0.1]
lr_results = {}
device = torch.device('cpu')

# Loop-invariant setup: the training tensors, the loss and the patience do
# not depend on the learning rate, so build them once. The tensors are moved
# to the same device as the model.
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1).to(device)
criterion = nn.BCELoss()  # the model already applies a sigmoid to its output
patience = 10

for lr in learning_rates:
    # Re-seed before every run so each learning rate starts from the same
    # initial weights and sees the same dropout masks; otherwise differences
    # in the final loss mix the effect of lr with random initialisation.
    torch.manual_seed(42)
    model = SimpleNN().to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    best_loss = float('inf')
    patience_counter = 0
    # Full-batch training for at most 100 epochs, with early stopping on the
    # training loss.
    for epoch in range(100):
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        current_loss = loss.item()
        if current_loss < best_loss:
            best_loss = current_loss
            patience_counter = 0
        else:
            patience_counter += 1
            # Stop after `patience` consecutive epochs without improvement.
            if patience_counter >= patience:
                break

    lr_results[lr] = best_loss
    print(f"学习率 {lr}: 最佳损失 = {best_loss:.6f}")
# 6. Comparison visualization: a 2x2 grid of panels.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top row: search time and best CV accuracy for each tuning method.
methods = ['网格搜索', '随机搜索', '贝叶斯优化']
times = [grid_time, random_time, optuna_time]
scores = [grid_search.best_score_, random_search.best_score_, study.best_value]
x = np.arange(len(methods))

for ax, heights, bar_color, y_label, panel_title in (
    (axes[0, 0], times, 'steelblue', '时间(秒)', '调优方法比较 - 时间'),
    (axes[0, 1], scores, 'coral', 'CV准确率', '调优方法比较 - 准确率'),
):
    ax.bar(x, heights, color=bar_color, alpha=0.7)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.set_xticks(x)
    ax.set_xticklabels(methods)

# Restrict the accuracy axis to the 0.8-1.0 band.
axes[0, 1].set_ylim([0.8, 1.0])
# Hyperparameter importance from the Optuna study.
# The previous approach took the std of trial values over the trials that
# contain each parameter. Every trial contains every parameter, so all bars
# came out identical and carried no information. Use Optuna's importance
# evaluator, which returns a mapping of parameter name -> importance score.
importance_dict = dict(optuna.importance.get_param_importances(study))
axes[1, 0].barh(list(importance_dict.keys()), list(importance_dict.values()),
                color='lightgreen', edgecolor='black')
# The label no longer mentions a standard deviation because the plotted
# quantity is now an importance score.
axes[1, 0].set_xlabel('重要性')
axes[1, 0].set_title('超参数重要性')
# Neural-network learning-rate panel: best training loss per learning rate,
# with a log-scaled x axis because the rates span several orders of magnitude.
axes[1, 1].plot(list(lr_results.keys()), list(lr_results.values()), marker='o',
                linewidth=2, markersize=8, color='purple')
axes[1, 1].set_xlabel('学习率')
axes[1, 1].set_ylabel('最佳训练损失')
axes[1, 1].set_title('学习率对神经网络的影响')
axes[1, 1].set_xscale('log')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('hyperparameter_tuning.png', dpi=100, bbox_inches='tight')
# Both messages need an explicit "\n" escape; a raw line break inside a
# single-quoted string literal is a SyntaxError.
print("\n可视化保存为 'hyperparameter_tuning.png'")
print("\n超参数调优完成!")
模型调优策略
- 树模型:关注深度、min_samples、max_features
- 提升方法(Boosting):learning_rate、n_estimators、subsample
- 神经网络:学习率、批量大小、正则化
- SVM:C和核类型最为重要
最佳实践
- 对连续参数的搜索空间进行对数缩放
- 使用交叉验证以获得稳健的估计
- 从随机搜索开始进行初步探索
- 使用贝叶斯优化进行最终细化
- 监控收益递减
交付物
- 找到的最佳超参数
- 表现最好的若干配置的性能指标
- 调优效率分析
- 参数影响的可视化
- 调优报告和建议