---
name: calibration-trainer
description: 用于提高预测准确性和减少过度自信的概率校准训练技能
allowed-tools:
  - Read
  - Write
  - Glob
  - Grep
  - Bash
metadata:
  specialization: 决策智能
  domain: 商业
  category: 协作
  priority: medium
  tools-libraries:
    - numpy
    - matplotlib
    - custom quiz engines
---
校准训练师
概述
校准训练师技能提供评估和改进预测者校准的能力。它帮助决策者将他们的置信水平与实际准确性对齐,减少过度自信,提高概率判断的质量。
能力
- 校准测验生成
- 置信区间征询
- Brier分数计算
- 校准曲线绘制
- 过度自信/自信不足诊断
- 训练练习管理
- 随时间进度跟踪
- 基准比较
被以下流程使用
- 认知偏见去偏过程
- 决策质量评估
- 预测分析实施
使用方法
校准测验
# 生成校准测验
quiz_config = {
"type": "general_knowledge",
"format": "confidence_interval",
"questions": 20,
"confidence_levels": [50, 80, 90], # 需要征询的置信水平(百分比)
"difficulty": "medium",
"domains": ["business", "economics", "technology", "geography"]
}
# 示例问题
quiz_question = {
"id": "Q001",
"question": "亚马逊成立于哪一年?",
"actual_answer": 1994,
"format": "numeric_interval",
"required_responses": [
{"confidence": 50, "prompt": "给出你的最佳估计"},
{"confidence": 80, "prompt": "给出一个你80%确信包含答案的范围"},
{"confidence": 90, "prompt": "给出一个你90%确信包含答案的范围"}
]
}
响应收集
# 收集响应
responses = {
"participant": "张三",
"date": "2024-01-15",
"questions": [
{
"question_id": "Q001",
"responses": {
"point_estimate": 1997,
"interval_80": [1995, 2000],
"interval_90": [1992, 2002]
}
}
# ... 更多问题
]
}
校准分析
# 分析校准
calibration_analysis = {
"participant": "张三",
"n_questions": 20,
"by_confidence_level": {
"80%_intervals": {
"expected_hit_rate": 0.80,
"actual_hit_rate": 0.55,
"calibration_gap": 0.25, # 预期 - 实际命中率; 正值 = 过度自信
"interpretation": "过度自信"
},
"90%_intervals": {
"expected_hit_rate": 0.90,
"actual_hit_rate": 0.70,
"calibration_gap": 0.20, # 预期 - 实际命中率; 正值 = 过度自信
"interpretation": "过度自信"
}
},
"brier_score": 0.18, # 越低越好,0 = 完美
"overconfidence_index": 0.23,
"recommendations": [
"将置信区间扩大约25%",
"练习特定领域的问题",
"使用参考类思维"
]
}
训练练习
# 校准训练计划
training_program = {
"participant": "张三",
"baseline_calibration": 0.55, # 80%区间的命中率
"target_calibration": 0.75,
"exercises": [
{
"week": 1,
"focus": "interval_widening",
"exercise": "练习给出比直觉宽50%的区间",
"quiz_count": 10
},
{
"week": 2,
"focus": "reference_class",
"exercise": "对于每个估计,首先识别一个参考类",
"quiz_count": 10
},
{
"week": 3,
"focus": "decomposition",
"exercise": "将复杂估计分解为组成部分",
"quiz_count": 10
},
{
"week": 4,
"focus": "consolidation",
"exercise": "应用所有技术,跟踪改进",
"quiz_count": 20
}
]
}
进度跟踪
# 随时间跟踪进度
progress_data = {
"participant": "张三",
"history": [
{"date": "2024-01-01", "hit_rate_80": 0.55, "brier_score": 0.22},
{"date": "2024-01-15", "hit_rate_80": 0.62, "brier_score": 0.19},
{"date": "2024-02-01", "hit_rate_80": 0.68, "brier_score": 0.16},
{"date": "2024-02-15", "hit_rate_80": 0.74, "brier_score": 0.13}
],
"trend": "improving",
"improvement_rate": "每次会话约6个百分点"
}
输入模式
{
"operation": "quiz|analyze|train|track",
"quiz_config": {
"type": "string",
"format": "string",
"questions": "number",
"confidence_levels": ["number"]
},
"responses": {
"participant": "string",
"questions": ["object"]
},
"training_config": {
"target_calibration": "number",
"duration_weeks": "number"
}
}
输出模式
{
"quiz": {
"questions": ["object"],
"total_count": "number"
},
"calibration_analysis": {
"by_confidence_level": "object",
"brier_score": "number",
"overconfidence_index": "number",
"calibration_curve": "object"
},
"recommendations": ["string"],
"progress": {
"history": ["object"],
"trend": "string",
"target_achieved": "boolean"
}
}
校准指标
| 指标 | 公式 | 解释 |
|---|---|---|
| 命中率 | 包含真实值的区间百分比 | 应与置信水平匹配 |
| Brier分数 | 预测概率与实际结果(0或1)之间的均方误差 | 越低越好(0-1) |
| 校准差距 | 预期 - 实际命中率 | 正值 = 过度自信 |
| 过度自信指数 | 平均校准差距 | 量化整体偏差 |
校准曲线
一个校准良好的预测者具有:
- 50%置信区间中约有50%包含真实值
- 80%置信区间中约有80%包含真实值
- 90%置信区间中约有90%包含真实值
校准曲线以陈述的置信度为横轴、观察到的命中率为纵轴绘制;完美校准对应对角线,曲线低于对角线表示过度自信。
最佳实践
- 每次测验后立即提供反馈
- 按领域分别跟踪校准
- 专注于最常见的置信水平(80%,90%)
- 定期练习(每周比每月好)
- 为商业应用使用领域相关问题
- 与校准良好的基准(超级预测者)比较
- 庆祝改进,而不仅仅是准确性
改进校准的技术
| 技术 | 描述 |
|---|---|
| 扩大区间 | 从更宽开始,只有在有强证据时才缩小 |
| 参考类 | 使用类似情况的基础率 |
| 分解 | 将估计分解为组成部分 |
| 魔鬼代言人 | 积极寻找减少自信的理由 |
| 事前验尸 | 想象自己错了,找出原因 |
集成点
- 输入到决策质量评估
- 与风险分布拟合器连接,用于专家意见征询
- 支持去偏教练代理
- 与参考类预测器集成,用于基础率思维