name: 回归建模
description: 使用线性回归、多项式回归和正则化回归构建预测模型,用于连续值预测、趋势预测和关系量化
回归建模
概览
回归建模预测基于输入特征的连续目标值,建立变量之间的定量关系,用于预测和分析。
使用场景
- 预测销售、价格或其他连续数值结果
- 理解自变量和因变量之间的关系
- 基于历史数据预测趋势
- 量化特征对目标变量的影响
- 构建基线模型以与更复杂的算法比较
- 识别哪些变量对预测影响最大
回归类型
- 线性回归:用直线拟合数据
- 多项式回归:拟合非线性关系
- 岭回归(L2):正则化以防止过拟合
- 套索回归(L1):通过正则化进行特征选择
- 弹性网:结合岭回归和套索回归
- 鲁棒回归:对异常值有抵抗力
关键指标
- R² 分数:解释方差的比例
- RMSE:均方根误差
- MAE:平均绝对误差
- AIC/BIC:用于模型比较的信息准则
Python实现
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import (
LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
)
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
# Generate a synthetic sample: 200 points of one feature drawn uniformly from
# [0, 100), with a linear target (slope 2.5, intercept 30) plus Gaussian noise
# of standard deviation 50. The global NumPy seed is fixed first.
np.random.seed(42)
X = np.random.uniform(low=0, high=100, size=200).reshape(-1, 1)
y = 2.5 * X.squeeze() + 30 + np.random.normal(loc=0, scale=50, size=200)

# Hold out 20% of the rows for evaluation (no explicit random_state is passed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)
# Baseline model: ordinary least squares on the raw feature.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Report held-out metrics and the fitted line; every argument is printed on
# its own line via sep="\n".
print(
    "线性回归:",
    f" R² 分数: {r2_score(y_test, y_pred_lr):.4f}",
    f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_lr)):.4f}",
    f" 系数: {lr_model.coef_[0]:.4f}",
    f" 截距: {lr_model.intercept_:.4f}",
    sep="\n",
)
# Polynomial regression (degree 2).
# The feature expansion is fitted on the training split only and then reused
# unchanged for the test split, so no test information leaks into the fit.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)

# The header begins with "\n" so a blank line separates this report from the
# previous model's output.
print("\n多项式回归(二次):")
print(f" R² 分数: {r2_score(y_test, y_pred_poly):.4f}")
print(f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_poly)):.4f}")
# Ridge regression (L2 penalty) with a fixed regularization strength.
# The feature is used unscaled, so alpha applies on the raw feature scale.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)

# Leading "\n" inserts a blank line before this model's report.
print("\n岭回归(alpha=1.0):")
print(f" R² 分数: {r2_score(y_test, y_pred_ridge):.4f}")
print(f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_ridge)):.4f}")
# Lasso regression (L1 penalty) with a fixed regularization strength.
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

# Leading "\n" inserts a blank line before this model's report.
print("\n套索回归(alpha=0.1):")
print(f" R² 分数: {r2_score(y_test, y_pred_lasso):.4f}")
print(f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_lasso)):.4f}")
# Elastic net: l1_ratio=0.5 weights the L1 and L2 penalties equally.
elastic_model = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_model.fit(X_train, y_train)
y_pred_elastic = elastic_model.predict(X_test)

# Leading "\n" inserts a blank line before this model's report.
print("\n弹性网回归:")
print(f" R² 分数: {r2_score(y_test, y_pred_elastic):.4f}")
print(f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_elastic)):.4f}")
# Robust regression (Huber loss), intended to be less sensitive to outliers.
# max_iter is raised to 1000 to give the optimizer room to converge.
huber_model = HuberRegressor(max_iter=1000, alpha=0.1)
huber_model.fit(X_train, y_train)
y_pred_huber = huber_model.predict(X_test)

# Leading "\n" inserts a blank line before this model's report.
print("\nHuber回归(鲁棒):")
print(f" R² 分数: {r2_score(y_test, y_pred_huber):.4f}")
print(f" RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_huber)):.4f}")
# Visualization: a 2x3 grid with one panel per model, plotting actual and
# predicted targets against the feature on the held-out split.
# NOTE(review): titles and legends are Chinese; matplotlib needs a font with
# CJK glyphs configured or they may render as empty boxes — confirm locally.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
models_data = [
    (X_test, y_test, y_pred_lr, '线性'),
    (X_test_poly, y_test, y_pred_poly, '多项式(二次)'),
    (X_test, y_test, y_pred_ridge, '岭回归'),
    (X_test, y_test, y_pred_lasso, '套索回归'),
    (X_test, y_test, y_pred_elastic, '弹性网'),
    (X_test, y_test, y_pred_huber, 'Huber'),
]
for idx, (X_p, y_t, y_p, label) in enumerate(models_data):
    if label == '多项式(二次)':
        # The degree-2 expansion has three columns: constant, x, x^2.
        # Column 1 is the original feature, which is the x-axis we want.
        x_plot = X_p[:, 1]
    else:
        x_plot = X_p
    ax = axes[idx // 3, idx % 3]
    ax.scatter(x_plot, y_t, alpha=0.5, label='实际值')
    ax.scatter(x_plot, y_p, alpha=0.5, color='red', label='预测值')
    # Two-line title: model name, then its test-set R².
    ax.set_title(f'{label}\nR²={r2_score(y_t, y_p):.4f}')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Residual diagnostics for the linear baseline, side by side:
#   left  - residuals against fitted values, with a zero reference line
#   right - histogram of the residuals
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
residuals = y_test - y_pred_lr

axes[0].scatter(y_pred_lr, residuals, alpha=0.5)
axes[0].axhline(y=0, color='r', linestyle='--')
axes[0].set(title='残差图', xlabel='拟合值', ylabel='残差')

axes[1].hist(residuals, bins=20, edgecolor='black')
axes[1].set(title='残差分布', xlabel='残差', ylabel='频率')

plt.tight_layout()
plt.show()
# 5-fold cross-validation of a fresh linear model on the full dataset,
# scored by R².
cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')

# Leading "\n" inserts a blank line before the cross-validation report.
print(f"\n交叉验证R²分数: {cv_scores}")
print(f"平均CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
# Regularization strength search for ridge regression.
# Evaluate 100 alphas spaced logarithmically between 1e-3 and 1e3; each is
# scored by mean 5-fold cross-validated R² on the training split only, so the
# held-out test split plays no part in the selection.
alphas = np.logspace(-3, 3, 100)
ridge_scores = [
    cross_val_score(
        Ridge(alpha=alpha), X_train, y_train, cv=5, scoring='r2'
    ).mean()
    for alpha in alphas
]
best_alpha_idx = np.argmax(ridge_scores)
best_alpha = alphas[best_alpha_idx]

# Plot the score curve on a log-scaled alpha axis and mark the best alpha.
plt.figure(figsize=(10, 5))
plt.semilogx(alphas, ridge_scores)
plt.axvline(x=best_alpha, color='red', linestyle='--', label=f'最佳alpha={best_alpha:.4f}')
plt.xlabel('Alpha(正则化强度)')
plt.ylabel('交叉验证R²分数')
plt.title('岭回归:Alpha调整')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Feature importance, reported as the raw coefficients of the linear model.
# The hasattr guard skips the report if the model exposes no coef_ attribute.
if hasattr(lr_model, 'coef_'):
    # Leading "\n" inserts a blank line before the coefficient report.
    print(f"\n模型系数: {lr_model.coef_}")
# Additional evaluation and diagnostics: approximate 95% prediction interval
# for the linear baseline.
from scipy import stats as sp_stats

predictions = lr_model.predict(X_test)
residuals = y_test - predictions
mse = np.mean(residuals**2)
rmse = np.sqrt(mse)

# Degrees of freedom: samples minus features minus one for the intercept.
# (The original script repeated these three assignments twice; once suffices.)
n = len(X_test)
p = X_test.shape[1]
dof = n - p - 1

# Two-sided 95% interval, hence the 0.975 quantile of the t distribution.
t_val = sp_stats.t.ppf(0.975, dof)

# NOTE: `margin` is a single scalar applied to every prediction, and it is
# estimated from the test-split residuals. It is a constant-width
# approximation that does not widen for points far from the feature mean.
margin = t_val * np.sqrt(mse * (1 + 1/n))
pred_intervals = np.column_stack([
    predictions - margin,
    predictions + margin,
])

# Leading "\n" inserts a blank line before the interval report.
print("\n预测区间(95%):")
print(f"第一个预测: {predictions[0]:.2f} [{pred_intervals[0, 0]:.2f}, {pred_intervals[0, 1]:.2f}]")
# Variance inflation factors (VIF) as a multicollinearity diagnostic.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# The original called `X.values`, which only exists on pandas objects; with
# the ndarray built in this script it raised AttributeError, which a bare
# `except:` then reported as "not enough features". np.asarray accepts both
# DataFrames and ndarrays, so the real reason for skipping is now explicit.
vif_matrix = np.asarray(X, dtype=float)
vif_feature_count = vif_matrix.shape[1]
vif_data = pd.DataFrame()
vif_data["特征"] = X.columns if hasattr(X, 'columns') else range(vif_feature_count)

if vif_feature_count < 2:
    # VIF regresses each feature on the others, so with a single feature
    # there is nothing to regress on.
    print("VIF计算跳过(特征不足)")
else:
    try:
        vif_data["VIF"] = [
            variance_inflation_factor(vif_matrix, i)
            for i in range(vif_feature_count)
        ]
    except (ValueError, np.linalg.LinAlgError) as exc:
        # Best effort: a numerical failure skips the diagnostic but states why.
        print(f"VIF计算跳过: {exc}")
    else:
        print("\n方差膨胀因子(VIF):")
        print(vif_data)
# Per-segment performance: compare RMSE on the lowest and highest quartile of
# (up to) the first two features. This only runs when X_test is a DataFrame;
# with the ndarray built in this script the whole section is skipped.
if hasattr(X_test, 'columns'):
    # Index the targets positionally. The original used `y_test[low.index]`,
    # which passes DataFrame index *labels* to what may be a plain ndarray and
    # can select the wrong rows after a shuffled train/test split.
    y_test_values = np.asarray(y_test)
    segment_results = {}
    for feat in X_test.columns[:2]:
        q1, q3 = X_test[feat].quantile([0.25, 0.75])
        low_mask = (X_test[feat] <= q1).to_numpy()
        high_mask = (X_test[feat] >= q3).to_numpy()
        if low_mask.any() and high_mask.any():
            low_errors = y_test_values[low_mask] - lr_model.predict(X_test[low_mask])
            high_errors = y_test_values[high_mask] - lr_model.predict(X_test[high_mask])
            segment_results[feat] = {
                '低RMSE': np.sqrt(np.mean(low_errors**2)),
                '高RMSE': np.sqrt(np.mean(high_errors**2)),
            }
    if segment_results:
        # Leading "\n" inserts a blank line before the segment report.
        print("\n段性能:")
        for feat, results in segment_results.items():
            print(f" {feat}: 低={results['低RMSE']:.2f}, 高={results['高RMSE']:.2f}")

# Final status line, printed regardless of whether the segment report ran.
print("\n回归模型评估完成!")
模型假设检查
- 线性:关系是线性的
- 独立性:观测值是独立的
- 同方差性:误差的方差是恒定的
- 正态性:误差是正态分布的
- 无多重共线性:特征之间不是高度相关的
模型选择
- 简单数据:线性回归
- 非线性模式:多项式回归
- 特征众多:套索回归或弹性网
- 异常值:鲁棒回归
- 防止过拟合:岭回归或弹性网
交付物
- 拟合模型及系数
- R²和RMSE指标
- 残差图和分析
- 交叉验证结果
- 正则化参数调整曲线
- 模型比较总结
- 预测值及预测区间