生存分析
概览
生存分析研究事件发生前的时间,处理部分受试者未观察到事件的截尾数据,从而预测寿命和风险评估。
核心概念
- 生存时间:事件发生前的时间
- 截尾:未观察到事件(受试者退出)
- 风险:在时间t的即时风险
- 生存曲线:在时间t之后存活的概率
- 风险比:不同组之间的相对风险
常见模型
- Kaplan-Meier:非参数生存曲线
- Cox比例风险模型:半参数回归
- Weibull/Exponential:参数模型
- Log-rank Test:比较生存曲线
- Competing Risks:多种事件类型
使用Python实现
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter, WeibullAFTFitter
from lifelines.statistics import logrank_test
import warnings
warnings.filterwarnings('ignore')
# 生成样本生存数据
np.random.seed(42)
n_patients = 200
# 事件发生时间(以月为单位)
event_times = np.random.exponential(scale=24, size=n_patients)
# 截尾指示器(1 = 事件发生,0 = 截尾)
event_observed = np.random.binomial(1, 0.7, n_patients)
# 组分配(0 = 对照组,1 = 治疗组)
group = np.random.binomial(1, 0.5, n_patients)
# 基线年龄
age = np.random.uniform(30, 80, n_patients)
# 风险评分
risk_score = np.random.uniform(0, 100, n_patients)
# 根据组调整事件时间(模拟治疗效果)
event_times = event_times * (1 + group * 0.3)
df = pd.DataFrame({
'time': event_times,
'event': event_observed,
'group': group,
'age': age,
'risk_score': risk_score,
})
print("生存数据摘要:")
print(df.head(10))
print(f"
总受试者:{len(df)}")
print(f"事件:{df['event'].sum()} ({df['event'].sum()/len(df)*100:.1f}%)")
print(f"截尾:{(1-df['event']).sum()} ({(1-df['event']).sum()/len(df)*100:.1f}%)")
# 1. Kaplan-Meier 估计
kmf = KaplanMeierFitter()
kmf.fit(df['time'], df['event'], label='Overall')
print("
1. Kaplan-Meier 生存估计:")
print(f"中位生存时间:{kmf.median_survival_time_:.1f} 月")
print(f"6个月生存:{kmf.predict(6):.1%}")
print(f"12个月生存:{kmf.predict(12):.1%}")
print(f"24个月生存:{kmf.predict(24):.1%}")
# 2. 组比较
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 总体生存曲线
ax = axes[0, 0]
kmf.plot_survival_function(ax=ax, linewidth=2)
ax.set_xlabel('时间(月)')
ax.set_ylabel('生存概率')
ax.set_title('Kaplan-Meier 生存曲线(总体)')
ax.grid(True, alpha=0.3)
# 按组生存曲线
ax = axes[0, 1]
for group_val in [0, 1]:
mask = df['group'] == group_val
kmf.fit(df[mask]['time'], df[mask]['event'],
label=f'{"对照组" if group_val == 0 else "治疗组"}')
kmf.plot_survival_function(ax=ax, linewidth=2)
ax.set_xlabel('时间(月)')
ax.set_ylabel('生存概率')
ax.set_title('按组Kaplan-Meier曲线')
ax.grid(True, alpha=0.3)
# 3. Log-Rank 测试
mask_control = df['group'] == 0
mask_treatment = df['group'] == 1
results = logrank_test(
df[mask_control]['time'],
df[mask_treatment]['time'],
df[mask_control]['event'],
df[mask_treatment]['event']
)
print(f"
3. Log-Rank 测试:")
print(f"测试统计量:{results.test_statistic:.4f}")
print(f"P值:{results.p_value:.4f}")
print(f"显著性:{'是' if results.p_value < 0.05 else '否'}")
# 4. 风险组(按四分位数)
df['risk_quartile'] = pd.qcut(df['risk_score'], q=4, labels=['低', '中低', '中高', '高'])
ax = axes[1, 0]
for risk_group in ['低', '中低', '中高', '高']:
mask = df['risk_quartile'] == risk_group
kmf.fit(df[mask]['time'], df[mask]['event'], label=risk_group)
kmf.plot_survival_function(ax=ax, linewidth=2)
ax.set_xlabel('时间(月)')
ax.set_ylabel('生存概率')
ax.set_title('按风险四分位数Kaplan-Meier曲线')
ax.legend()
ax.grid(True, alpha=0.3)
# 5. 累积风险
ax = axes[1, 1]
kmf.fit(df['time'], df['event'])
kmf.plot_cumulative_density(ax=ax, linewidth=2)
ax.set_xlabel('时间(月)')
ax.set_ylabel('累积事件概率')
ax.set_title('累积事件概率')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# 6. Cox比例风险模型
cph = CoxPHFitter()
cph.fit(df[['time', 'event', 'group', 'age', 'risk_score']], duration_col='time', event_col='event')
print(f"
6. Cox比例风险模型:")
print(cph.summary)
# 风险比
print(f"
风险比:")
for var in ['group', 'age', 'risk_score']:
hr = np.exp(cph.params_[var])
print(f" {var}: {hr:.3f}")
# 7. 模型诊断
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# 部分效应图
ax = axes[0, 0]
df_partial = df.copy()
df_partial['partial_hazard'] = cph.predict_partial_hazard(df_partial)
for group_val in [0, 1]:
mask = df_partial['group'] == group_val
ax.scatter(df_partial[mask]['risk_score'], df_partial[mask]['partial_hazard'],
alpha=0.6, label=f'{"对照组" if group_val == 0 else "治疗组"}')
ax.set_xlabel('风险评分')
ax.set_ylabel('部分风险')
ax.set_title('按风险评分和组的部分风险')
ax.legend()
ax.grid(True, alpha=0.3)
# 一致性指数随时间变化
ax = axes[0, 1]
concordance_index = cph.concordance_index_
ax.text(0.5, 0.5, f'一致性指数:{concordance_index:.3f}',
ha='center', va='center', fontsize=14,
bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
ax.axis('off')
ax.set_title('模型性能')
# 按预测风险的生存曲线
ax = axes[1, 0]
df['predicted_hazard'] = cph.predict_partial_hazard(df)
df['hazard_quartile'] = pd.qcut(df['predicted_hazard'], q=4, labels=['低', '中低', '中高', '高'])
for hazard_group in ['低', '中低', '中高', '高']:
mask = df['hazard_quartile'] == hazard_group
kmf.fit(df[mask]['time'], df[mask]['event'], label=hazard_group)
kmf.plot_survival_function(ax=ax, linewidth=2)
ax.set_xlabel('时间(月)')
ax.set_ylabel('生存概率')
ax.set_title('按预测风险四分位数的生存曲线')
ax.grid(True, alpha=0.3)
# 变量重要性
ax = axes[1, 1]
coef_df = cph.summary[['coef', 'exp(coef)']].copy()
coef_df = coef_df.sort_values('coef')
colors = ['red' if x < 0 else 'green' for x in coef_df['coef']]
ax.barh(coef_df.index, coef_df['coef'], color=colors, alpha=0.7, edgecolor='black')
ax.set_xlabel('系数')
ax.set_title('变量系数')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()
# 8. 生存预测
new_patient = pd.DataFrame({
'group': [1],
'age': [65],
'risk_score': [75],
})
survival_prob = cph.predict_survival_function(new_patient, times=[6, 12, 24])
print(f"
8. 新患者生存预测(年龄65,治疗,风险75):")
print(f"6个月生存:{survival_prob.iloc[0, 0]:.1%}")
print(f"12个月生存:{survival_prob.iloc[1, 0]:.1%}")
print(f"24个月生存:{survival_prob.iloc[2, 0]:.1%}")
# 9. 比例风险假设
print(f"
9. 比例风险测试:")
from lifelines.statistics import proportional_hazard_assumption
ph_test = proportional_hazard_assumption(cph, df[['time', 'event', 'group', 'age', 'risk_score']],
time_transform='rank')
print(ph_test)
# 10. 汇总统计
print(f"
" + "="*50)
print("生存分析汇总")
print("="*50)
print(f"对照组中位生存:{df[df['group']==0]['time'].median():.1f} 月")
print(f"治疗组中位生存:{df[df['group']==1]['time'].median():.1f} 月")
print(f"Log-rank p值:{results.p_value:.4f}")
print(f"一致性指数:{concordance_index:.3f}")
print("="*50)
截尾类型
- 右截尾:事件未发生(最常见)
- 左截尾:事件发生在观察之前
- 区间截尾:事件在未知时间间隔内发生
模型比较
- Kaplan-Meier:描述,不解释
- Cox模型:调整协变量,比例风险
- 参数模型:假设分布
- 竞争风险:多种事件类型
应用
- 临床试验
- 设备可靠性
- 客户流失
- 员工留存
- 产品寿命
交付物
- Kaplan-Meier 生存曲线
- 生存概率估计
- Log-rank 测试结果
- Cox模型系数
- 风险比
- 风险分层组
- 生存预测
- 模型诊断