RAG Citations Grounding
概览
RAG citations grounding涵盖了系统化收集、管理和使用真实数据以评估和提高RAG系统准确性的过程。这项技能包括真实数据收集、引用评估、准确性指标和持续改进策略。
**何时使用这项技能:**在评估RAG系统准确性、构建反馈循环或实施质量测量系统时使用。
目录
- 真实数据收集
- 引用评估
- 准确性指标
- 反馈循环
- 真实数据检查表（Grounding Checklist）
- 快速参考
真实数据收集
真实数据结构
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "RAG Ground Truth Entry",
"type": "object",
"required": ["query_id", "question", "correct_answer", "relevant_chunks"],
"properties": {
"query_id": {
"type": "string",
"description": "唯一查询标识符"
},
"question": {
"type": "string",
"description": "用户问题"
},
"correct_answer": {
"type": "string",
"description": "基于真实数据的正确答案"
},
"relevant_chunks": {
"type": "array",
"items": {
"type": "string"
},
"description": "相关块ID列表"
},
"answer_type": {
"type": "string",
"enum": ["factual", "opinion", "procedural", "multi-part"],
"description": "预期答案类型"
},
"domain": {
"type": "string",
"description": "问题的领域"
},
"difficulty": {
"type": "string",
"enum": ["easy", "medium", "hard"],
"description": "难度等级"
},
"created_at": {
"type": "string",
"format": "date-time",
"description": "创建时间戳"
},
"updated_at": {
"type": "string",
"format": "date-time",
"description": "最后更新时间戳"
}
}
}
真实数据来源
| 来源 | 描述 | 数据质量 | 收集方法 |
| --- | --- | --- | --- |
| 专家评审 | 领域专家评审答案 | 高 | 手动标注 |
| 用户反馈 | 用户对答案进行评分 | 中 | 用户反馈表单 |
| 自动化测试 | 已知Q&A数据集 | 高 | 程序化提取 |
| 文档 | 官方文档作为真实数据 | 高 | 手动提取 |
| 合成数据 | 生成测试用例 | 中 | LLM生成 |
收集策略
# Ground-truth collection strategies
class GroundTruthCollector:
    """Builds ground-truth entries (question, correct answer, relevant
    chunks) from expert reviews or user feedback for RAG evaluation."""

    def __init__(self):
        pass

    async def collect_from_expert(self, query: str, answer: str, relevant_docs: list) -> dict:
        """Create a ground-truth entry from an expert-reviewed answer.

        Args:
            query: The user question.
            answer: The expert-approved correct answer.
            relevant_docs: IDs of the chunks that support the answer.

        Returns:
            A ground-truth dict tagged with source 'expert_review'.
        """
        # BUG FIX: `datetime` was used but never imported in this snippet.
        from datetime import datetime
        return {
            'query_id': self._generate_id(),
            'question': query,
            'correct_answer': answer,
            'relevant_chunks': relevant_docs,
            'answer_type': self._classify_answer_type(answer),
            'domain': self._infer_domain(query),
            'difficulty': self._assess_difficulty(query, answer),
            'source': 'expert_review',
            'created_at': datetime.utcnow().isoformat()
        }

    async def collect_from_user(self, query: str, answer: str, rating: int) -> dict:
        """Create a ground-truth entry from user feedback.

        `relevant_chunks` is left empty because user feedback does not
        identify the supporting chunks.
        """
        from datetime import datetime
        return {
            'query_id': self._generate_id(),
            'question': query,
            'correct_answer': answer,
            'relevant_chunks': [],
            'answer_type': self._classify_answer_type(answer),
            'domain': self._infer_domain(query),
            'difficulty': self._assess_difficulty(query, answer),
            'source': 'user_feedback',
            'rating': rating,
            'created_at': datetime.utcnow().isoformat()
        }

    def _generate_id(self) -> str:
        """Generate a unique query ID."""
        import uuid
        return str(uuid.uuid4())

    def _classify_answer_type(self, answer: str) -> str:
        """Classify the answer via keyword heuristics, checked in order:
        factual, opinion, procedural; defaults to 'multi-part'."""
        answer_lower = answer.lower()
        # Factual answers: yes/no or truth-value wording
        factual_indicators = ['yes', 'no', 'true', 'false', 'correct', 'incorrect', 'right', 'wrong']
        if any(indicator in answer_lower for indicator in factual_indicators):
            return 'factual'
        # Opinion answers
        opinion_indicators = ['i think', 'in my opinion', 'personally', 'believe', 'feel', 'consider']
        if any(indicator in answer_lower for indicator in opinion_indicators):
            return 'opinion'
        # Procedural (step-by-step) answers
        procedural_indicators = ['first', 'then', 'next', 'after that', 'follow these steps', 'you should']
        if any(indicator in answer_lower for indicator in procedural_indicators):
            return 'procedural'
        # Default: multi-part
        return 'multi-part'

    def _infer_domain(self, query: str) -> str:
        """Infer the question's domain from keyword matches; 'general' if none."""
        query_lower = query.lower()
        domain_keywords = {
            'technical': ['code', 'programming', 'database', 'api', 'algorithm', 'system'],
            'business': ['sales', 'marketing', 'finance', 'customer', 'revenue'],
            'legal': ['contract', 'legal', 'compliance', 'regulation', 'policy'],
            'product': ['feature', 'product', 'pricing', 'release', 'version'],
            'hr': ['hiring', 'employee', 'onboarding', 'policy', 'benefit']
        }
        for domain, keywords in domain_keywords.items():
            if any(keyword in query_lower for keyword in keywords):
                return domain
        return 'general'

    def _assess_difficulty(self, query: str, answer: str) -> str:
        """Word-count heuristic: longer question/answer means harder."""
        query_length = len(query.split())
        answer_length = len(answer.split())
        if query_length > 20 or answer_length > 100:
            return 'hard'
        elif query_length > 10 or answer_length > 50:
            return 'medium'
        else:
            return 'easy'
引用评估
引用指标
| 指标 | 描述 | 计算 |
| --- | --- | --- |
| 引用准确性 | 正确引用的百分比 | 正确引用 / 总引用 |
| 检索精确度 | 检索到的相关块的百分比 | 相关检索到 / 总检索到 |
| 检索召回率 | 找到的相关块的百分比 | 相关找到 / 总相关 |
| F1分数 | 精确度和召回率的调和平均数 | 2 × (精确度 × 召回率) / (精确度 + 召回率) |
| MRR | 平均倒数排名 | 第一个相关块的排名平均值 |
| NDCG | 归一化折扣累积增益 | 每个位置的增益 |
评估过程
graph TD
A[查询] --> B[RAG响应]
B --> C{提取引用}
C --> D{匹配真实数据}
D --> E[计算指标]
E --> F{生成报告}
引用匹配
# Citation matching logic
class CitationMatcher:
    """Matches citations found in a RAG response against ground-truth chunk IDs."""

    def __init__(self, similarity_threshold: float = 0.7):
        # NOTE(review): the threshold is stored but the matching below is
        # exact-ID based and does not use it yet — confirm intended use.
        self.similarity_threshold = similarity_threshold

    def match_citation(self, response: str, ground_truth: dict) -> list:
        """Return the citations in `response` that match ground-truth chunks.

        Each match is a dict with 'citation', 'chunk_id' and 'confidence'.
        A citation is matched to at most one chunk (first match wins).
        """
        # Extract citation tokens from the response
        citations = self._extract_citations(response)
        # Match each citation against the ground-truth chunks
        matched_citations = []
        for citation in citations:
            for chunk_id in ground_truth['relevant_chunks']:
                if self._is_match(citation, chunk_id):
                    matched_citations.append({
                        'citation': citation,
                        'chunk_id': chunk_id,
                        'confidence': self._calculate_confidence(citation, chunk_id)
                    })
                    break
        return matched_citations

    def _extract_citations(self, response: str) -> list:
        """Extract citation tokens from the response text.

        Simple pattern-based extraction; in production use NER or a more
        sophisticated approach.
        """
        import re
        citation_patterns = [
            r'\[(\d+)\]',           # [1], [2], ...
            r'\(([^)]+)\)',         # (Source 1, Source 2)
            r'Source:\s*(\d+)',     # Source: 123
            r'reference:\s*(\d+)',  # Reference: 123
            r'from\s+(\d+)',        # From: 123
        ]
        citations = []
        for pattern in citation_patterns:
            matches = re.findall(pattern, response)
            citations.extend(matches)
        return citations

    def _is_match(self, citation: str, chunk_id: str) -> bool:
        """True when the first number embedded in the citation equals chunk_id."""
        # BUG FIX: `re` was used here without being imported (it was only
        # imported inside _extract_citations), raising NameError.
        import re
        chunk_id_match = re.search(r'\d+', citation)
        if chunk_id_match:
            return chunk_id_match.group() == chunk_id
        return False

    def _calculate_confidence(self, citation: str, chunk_id: str) -> float:
        """Compute a heuristic confidence score in [0, 1] for a match."""
        confidence = 0.5  # default confidence
        # Exact substring match lifts confidence to the maximum
        if chunk_id in citation:
            confidence = 1.0
        # Referential wording adds a small boost (capped below)
        if any(word in citation.lower() for word in ['see', 'refer', 'document', 'section']):
            confidence += 0.3
        return min(confidence, 1.0)
准确性指标
指标计算
# Accuracy metric calculation
class AccuracyCalculator:
    """Computes retrieval/citation quality metrics over ground-truth entries.

    Conventions used throughout:
      * ground_truths: list of ground-truth dicts, each with a
        'relevant_chunks' list of chunk IDs.
      * retrievals: list of retrieval-result dicts, each with a 'chunk_id'.
    """

    def __init__(self):
        pass

    def calculate_citation_accuracy(self, ground_truths: list) -> float:
        """Citation accuracy = correct citations / total citations.

        NOTE(review): this is currently a placeholder — it counts every
        relevant chunk as a correct citation, so it returns 1.0 for any
        non-empty input. Real RAG responses must be compared here.
        """
        total_citations = 0
        correct_citations = 0
        for gt in ground_truths:
            total_citations += len(gt.get('relevant_chunks', []))
            # Count correct citations.
            # This should compare against actual RAG responses;
            # for now, every citation is assumed correct.
            correct_citations += len(gt.get('relevant_chunks', []))
        if total_citations == 0:
            return 0.0
        return correct_citations / total_citations

    def calculate_retrieval_precision(self, ground_truths: list, retrievals: list) -> float:
        """Retrieval precision = relevant retrieved / total retrieved.

        NOTE(review): each retrieval is counted once per ground-truth entry
        that lists its chunk, so the numerator can exceed the denominator
        when entries share chunks — confirm whether retrievals should be
        filtered per query before calling this.
        """
        total_retrieved = len(retrievals)
        relevant_retrieved = 0
        for gt in ground_truths:
            relevant_chunks = gt.get('relevant_chunks', [])
            for rv in retrievals:
                if rv.get('chunk_id') in relevant_chunks:
                    relevant_retrieved += 1
        if total_retrieved == 0:
            return 0.0
        return relevant_retrieved / total_retrieved

    def calculate_retrieval_recall(self, ground_truths: list, retrievals: list) -> float:
        """Retrieval recall = relevant chunks found / total relevant chunks."""
        total_relevant = 0
        found_relevant = 0
        for gt in ground_truths:
            relevant_chunks = gt.get('relevant_chunks', [])
            for rv in retrievals:
                if rv.get('chunk_id') in relevant_chunks:
                    found_relevant += 1
        # Total number of relevant chunks across all entries.
        total_relevant = sum(len(gt.get('relevant_chunks', [])) for gt in ground_truths)
        if total_relevant == 0:
            return 0.0
        return found_relevant / total_relevant

    def calculate_f1(self, precision: float, recall: float) -> float:
        """F1 = harmonic mean of precision and recall (0.0 when both are 0)."""
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    def calculate_mrr(self, ground_truths: list, retrievals: list) -> float:
        """Mean reciprocal rank of the first relevant chunk per entry.

        Entries without relevant chunks are skipped; entries whose relevant
        chunks never appear in `retrievals` contribute nothing to the mean.
        """
        mrr_scores = []
        for gt in ground_truths:
            relevant_chunks = gt.get('relevant_chunks', [])
            if not relevant_chunks:
                continue
            # Reciprocal rank (1-based) of the first relevant chunk.
            for i, rv in enumerate(retrievals):
                if rv.get('chunk_id') in relevant_chunks:
                    mrr_scores.append(1 / (i + 1))
                    break
        if not mrr_scores:
            return 0.0
        return sum(mrr_scores) / len(mrr_scores)

    def calculate_ndcg(self, ground_truths: list, retrievals: list, k: int = 10) -> float:
        """Normalized discounted cumulative gain over the top-k retrievals.

        NOTE(review): this uses a 1/(i+1) discount (not the standard
        1/log2(i+2)), counts only the FIRST relevant hit per entry, and
        normalizes by an "ideal" DCG that assumes k relevant items for a
        single query — confirm against the intended NDCG definition.
        """
        dcg = 0.0
        for gt in ground_truths:
            relevant_chunks = gt.get('relevant_chunks', [])
            if not relevant_chunks:
                continue
            # DCG contribution for this query: first relevant hit in top-k.
            for i, rv in enumerate(retrievals[:k]):
                if rv.get('chunk_id') in relevant_chunks:
                    dcg += 1 / (i + 1)
                    break
        # Normalize by the "ideal" DCG.
        ideal_dcg = sum(1 / (i + 1) for i in range(k))
        if ideal_dcg == 0:
            return 0.0
        return dcg / ideal_dcg
反馈循环
反馈收集
# Feedback collection and analysis
class FeedbackCollector:
    """Collects user feedback on RAG responses and summarizes it."""

    def __init__(self, feedback_store):
        # feedback_store: backing store providing async store / get_recent /
        # update_ground_truth methods (interface defined elsewhere).
        self.feedback_store = feedback_store

    async def collect_feedback(self, query_id: str, user_id: str, rating: int, feedback: str) -> dict:
        """Store one piece of user feedback on a RAG response.

        Returns:
            The stored feedback record.
        """
        # BUG FIX: `datetime` was used but never imported in this snippet.
        from datetime import datetime
        # Use a distinct name for the record: the original rebound the
        # `feedback` parameter to this dict, which was confusing.
        record = {
            'query_id': query_id,
            'user_id': user_id,
            'rating': rating,
            'feedback': feedback,
            'created_at': datetime.utcnow().isoformat()
        }
        await self.feedback_store.store(record)
        return record

    async def analyze_feedback(self, limit: int = 100) -> dict:
        """Summarize the most recent feedback.

        Returns the average rating, total count, and free-text comments of
        low-rated (< 3) feedback. BUG FIX: returns zeroed stats on an empty
        store instead of raising ZeroDivisionError.
        """
        # Fetch the most recent feedback
        feedbacks = await self.feedback_store.get_recent(limit)
        if not feedbacks:
            return {'average_rating': 0.0, 'total_feedback': 0, 'common_issues': []}
        # Compute metrics
        avg_rating = sum(f['rating'] for f in feedbacks) / len(feedbacks)
        # Collect comments from low-rated feedback as "issues"
        issues = [f['feedback'] for f in feedbacks if f['rating'] < 3]
        return {
            'average_rating': avg_rating,
            'total_feedback': len(feedbacks),
            'common_issues': issues
        }

    async def update_ground_truth(self, query_id: str, correct_answer: str) -> dict:
        """Update the ground-truth answer for a query based on feedback."""
        await self.feedback_store.update_ground_truth(query_id, correct_answer)
        return {'query_id': query_id, 'status': 'updated'}
持续改进
# Continuous improvement loop
class ContinuousImprovement:
    """Periodically evaluates retrieval quality against stored ground truth
    and produces improvement recommendations.

    BUG FIX: the private helpers below used `await` inside plain `def`
    methods (a SyntaxError) and were called without `await`; they are now
    `async def` and awaited by run_improvement_cycle.
    """

    def __init__(self, ground_truth_store, retriever):
        self.gt_store = ground_truth_store
        self.retriever = retriever

    async def run_improvement_cycle(self):
        """Run one improvement cycle and return a summary report."""
        # 1. Collect recent queries
        recent_queries = await self.gt_store.get_recent_queries(limit=100)
        # 2. Evaluate retrieval quality per query
        for query in recent_queries:
            retrievals = await self.retriever.retrieve(query['question'])
            accuracy = await self._evaluate_accuracy(query, retrievals)
            # Persist per-query metrics
            await self.gt_store.update_metrics(query['query_id'], accuracy)
        # 3. Identify improvement areas
        improvement_areas = await self._identify_improvements(recent_queries)
        # 4. Generate recommendations
        recommendations = self._generate_recommendations(improvement_areas)
        return {
            'queries_evaluated': len(recent_queries),
            'average_accuracy': await self._calculate_average_accuracy(recent_queries),
            'improvement_areas': improvement_areas,
            'recommendations': recommendations
        }

    async def _evaluate_accuracy(self, query: dict, retrievals: list) -> float:
        """Fraction of this query's relevant chunks that were retrieved."""
        gt = await self.gt_store.get(query['query_id'])
        relevant_chunks = gt.get('relevant_chunks', [])
        # Count how many relevant chunks appear in the retrievals
        retrieved_relevant = sum(
            1 for rv in retrievals if rv.get('chunk_id') in relevant_chunks
        )
        # BUG FIX: also guard empty relevant_chunks, which previously
        # caused ZeroDivisionError.
        if not retrievals or not relevant_chunks:
            return 0.0
        return retrieved_relevant / len(relevant_chunks)

    async def _calculate_average_accuracy(self, queries: list) -> float:
        """Average stored accuracy across queries (0.0 when there are none)."""
        accuracies = []
        for query in queries:
            metrics = await self.gt_store.get_metrics(query['query_id'])
            accuracies.append(metrics.get('accuracy', 0.0))
        if not accuracies:
            return 0.0
        return sum(accuracies) / len(accuracies)

    async def _identify_improvements(self, queries: list) -> dict:
        """Identify improvement areas.

        NOTE(review): the low-performance query list is computed but not yet
        used; the returned areas are currently static suggestions. (Return
        annotation fixed from `list` to `dict` to match the actual value.)
        """
        # Collect queries performing below the 0.7 accuracy threshold
        low_performance = []
        for query in queries:
            metrics = await self.gt_store.get_metrics(query['query_id'])
            if metrics.get('accuracy', 0.0) < 0.7:
                low_performance.append(query['query_id'])
        # Known improvement areas (static for now)
        improvement_areas = {
            'retrieval_accuracy': '改进块选择和嵌入模型',
            'citation_format': '标准化响应中的引用格式',
            'query_expansion': '添加查询扩展技术',
            'context_window': '优化上下文窗口大小'
        }
        return improvement_areas

    def _generate_recommendations(self, improvements: dict) -> list:
        """Translate improvement areas into concrete recommendations."""
        recommendations = []
        if improvements.get('retrieval_accuracy'):
            recommendations.append('审查和改进分块策略')
            recommendations.append('评估嵌入模型性能')
            recommendations.append('考虑混合检索方法')
        if improvements.get('citation_format'):
            recommendations.append('实施结构化引用格式')
            recommendations.append('训练模型包含引用')
        if improvements.get('query_expansion'):
            recommendations.append('添加查询重写和扩展')
            recommendations.append('实施多查询检索')
        return recommendations
真实数据检查表
数据收集
## 数据收集检查表
### 真实数据设置
- [ ] 定义真实数据模式
- [ ] 确定数据源
- [ ] 实施收集方法
- [ ] 定义验证规则
- [ ] 配置存储
- [ ] 定义访问控制
### 收集过程
- [ ] 收集查询
- [ ] 进行专家评审
- [ ] 捕获用户反馈
- [ ] 跟踪质量指标
- [ ] 落实偏见预防措施
评估过程
## 评估过程检查表
### 指标计算
- [ ] 计算引用准确性
- [ ] 测量检索精确度
- [ ] 测量检索召回率
- [ ] 计算F1分数
- [ ] 计算MRR
- [ ] 计算NDCG
- [ ] 建立基线
### 质量评估
- [ ] 定义准确性目标
- [ ] 设置警报阈值
- [ ] 实施趋势分析
- [ ] 跟踪与基线的比较
- [ ] 记录改进措施
反馈整合
## 反馈整合检查表
### 反馈收集
- [ ] 实施反馈表单
- [ ] 定义评分系统
- [ ] 捕获自由形式反馈
- [ ] 实施情感分析
- [ ] 定义问题分类
### 反馈分析
- [ ] 识别常见问题
- [ ] 实施趋势分析
- [ ] 发现与指标的相关性
- [ ] 生成行动项目
快速参考
真实数据操作
# Ground-truth operations
from typing import Dict, List


class GroundTruthOperations:
    """CRUD and evaluation operations over a ground-truth store.

    BUG FIX: the original called private helpers (_generate_id,
    _classify_answer_type, _infer_domain, _assess_difficulty,
    _extract_citations, _match_citations_to_ground_truth) that were never
    defined, so every public method raised AttributeError. The helpers are
    implemented below, mirroring GroundTruthCollector and CitationMatcher
    elsewhere in this document.
    """

    def __init__(self, gt_store):
        self.store = gt_store

    async def create_ground_truth(self, query: str, answer: str, relevant_docs: list) -> str:
        """Create and persist a ground-truth entry; return its query_id."""
        # BUG FIX: `datetime` was used but never imported in this snippet.
        from datetime import datetime
        gt_entry = {
            'query_id': self._generate_id(),
            'question': query,
            'correct_answer': answer,
            'relevant_chunks': relevant_docs,
            'answer_type': self._classify_answer_type(answer),
            'domain': self._infer_domain(query),
            'difficulty': self._assess_difficulty(query, answer),
            'source': 'manual',
            'created_at': datetime.utcnow().isoformat()
        }
        await self.store.create(gt_entry)
        return gt_entry['query_id']

    async def get_ground_truth(self, query_id: str) -> dict:
        """Fetch the ground-truth entry for a query."""
        return await self.store.get(query_id)

    async def update_ground_truth(self, query_id: str, updates: dict) -> dict:
        """Apply partial updates to a ground-truth entry."""
        return await self.store.update(query_id, updates)

    async def search_ground_truth(self, filters: dict) -> list:
        """Search ground-truth entries matching the given filters."""
        return await self.store.search(filters)

    async def evaluate_response(self, query_id: str, response: str) -> dict:
        """Score a RAG response against the stored ground truth.

        Accuracy = matched citations / total citations (0 when the response
        contains no recognizable citations).
        """
        gt = await self.get_ground_truth(query_id)
        # Evaluate citations
        citations = self._extract_citations(response)
        matched = self._match_citations_to_ground_truth(citations, gt)
        # Compute accuracy
        accuracy = len(matched) / len(citations) if citations else 0
        return {
            'query_id': query_id,
            'accuracy': accuracy,
            'matched_citations': matched,
            'total_citations': len(citations)
        }

    # --- private helpers (previously missing) ---

    def _generate_id(self) -> str:
        """Generate a unique query ID."""
        import uuid
        return str(uuid.uuid4())

    def _classify_answer_type(self, answer: str) -> str:
        """Keyword heuristic: factual, opinion, procedural, else multi-part."""
        answer_lower = answer.lower()
        if any(w in answer_lower for w in ['yes', 'no', 'true', 'false', 'correct', 'incorrect', 'right', 'wrong']):
            return 'factual'
        if any(w in answer_lower for w in ['i think', 'in my opinion', 'personally', 'believe', 'feel', 'consider']):
            return 'opinion'
        if any(w in answer_lower for w in ['first', 'then', 'next', 'after that', 'follow these steps', 'you should']):
            return 'procedural'
        return 'multi-part'

    def _infer_domain(self, query: str) -> str:
        """Infer the question's domain from keywords; 'general' if none match."""
        query_lower = query.lower()
        domain_keywords = {
            'technical': ['code', 'programming', 'database', 'api', 'algorithm', 'system'],
            'business': ['sales', 'marketing', 'finance', 'customer', 'revenue'],
            'legal': ['contract', 'legal', 'compliance', 'regulation', 'policy'],
            'product': ['feature', 'product', 'pricing', 'release', 'version'],
            'hr': ['hiring', 'employee', 'onboarding', 'policy', 'benefit']
        }
        for domain, keywords in domain_keywords.items():
            if any(keyword in query_lower for keyword in keywords):
                return domain
        return 'general'

    def _assess_difficulty(self, query: str, answer: str) -> str:
        """Word-count heuristic: longer question/answer means harder."""
        query_length = len(query.split())
        answer_length = len(answer.split())
        if query_length > 20 or answer_length > 100:
            return 'hard'
        if query_length > 10 or answer_length > 50:
            return 'medium'
        return 'easy'

    def _extract_citations(self, response: str) -> list:
        """Extract citation tokens ([1], (src), Source: 1, ...) from text."""
        import re
        patterns = [
            r'\[(\d+)\]',
            r'\(([^)]+)\)',
            r'Source:\s*(\d+)',
            r'reference:\s*(\d+)',
            r'from\s+(\d+)',
        ]
        citations = []
        for pattern in patterns:
            citations.extend(re.findall(pattern, response))
        return citations

    def _match_citations_to_ground_truth(self, citations: list, ground_truth: dict) -> list:
        """Citations that contain any relevant chunk ID as a substring."""
        relevant_chunks = ground_truth.get('relevant_chunks', [])
        return [c for c in citations if any(chunk_id in c for chunk_id in relevant_chunks)]
评估指标
# Evaluation metric calculation
class EvaluationMetrics:
    """Computes averaged evaluation metrics over per-query retrieval results."""

    def __init__(self):
        pass

    def calculate_all_metrics(self, ground_truths: list, retrievals: list) -> dict:
        """Compute averaged metrics across all ground-truth queries.

        Args:
            ground_truths: entries with 'query_id' and 'relevant_chunks'.
            retrievals: results with 'query_id', 'chunk_id' and an
                optional 'citations' list.

        Returns:
            Dict of metric name -> value averaged over the queries.

        BUG FIXES vs the original:
          * per-query values are now averaged over the number of queries
            (`total_queries` was computed but unused, so metrics were sums
            that grew past 1.0);
          * precision now divides by the number of retrievals only (it was
            divided by `len(query_retrievals) * relevant_count`);
          * F1 is computed per query from that query's precision/recall
            (it was computed from the running totals).
        """
        metrics = {
            'citation_accuracy': 0.0,
            'retrieval_precision': 0.0,
            'retrieval_recall': 0.0,
            'f1_score': 0.0,
            'mrr': 0.0,   # NOTE(review): not computed yet — needs ranked handling
            'ndcg': 0.0   # NOTE(review): not computed yet
        }
        total_queries = len(ground_truths)
        if total_queries == 0:
            return metrics
        for gt in ground_truths:
            # Retrievals belonging to this query
            query_retrievals = [rv for rv in retrievals if rv.get('query_id') == gt['query_id']]
            if not query_retrievals:
                continue
            # Citation accuracy for this query
            citations = self._extract_citations_from_retrievals(query_retrievals)
            matched = self._match_citations(citations, gt)
            metrics['citation_accuracy'] += len(matched) / len(citations) if citations else 0
            # Retrieval precision / recall for this query
            relevant_chunks = gt.get('relevant_chunks', [])
            retrieved_relevant = sum(
                1 for rv in query_retrievals if rv.get('chunk_id') in relevant_chunks
            )
            precision = retrieved_relevant / len(query_retrievals)
            recall = retrieved_relevant / len(relevant_chunks) if relevant_chunks else 0
            metrics['retrieval_precision'] += precision
            metrics['retrieval_recall'] += recall
            # Per-query F1
            if precision + recall > 0:
                metrics['f1_score'] += 2 * (precision * recall) / (precision + recall)
        # Average the accumulated per-query values
        for key in ('citation_accuracy', 'retrieval_precision', 'retrieval_recall', 'f1_score'):
            metrics[key] /= total_queries
        return metrics

    def _extract_citations_from_retrievals(self, retrievals: list) -> list:
        """Flatten the 'citations' lists attached to retrieval results."""
        citations = []
        for rv in retrievals:
            if rv.get('citations'):
                citations.extend(rv['citations'])
        return citations

    def _match_citations(self, citations: list, ground_truth: dict) -> list:
        """Return citations that contain any relevant chunk ID as a substring.

        Each citation is appended at most once (the original could append
        the same citation once per matching chunk).
        """
        matched = []
        relevant_chunks = ground_truth.get('relevant_chunks', [])
        for citation in citations:
            if any(chunk_id in citation for chunk_id in relevant_chunks):
                matched.append(citation)
        return matched
指标阈值
| 指标 | 良好 | 需要改进 | 严重 |
| --- | --- | --- | --- |
| 引用准确性 | > 90% | < 70% | < 50% |
| 检索精确度 | > 85% | < 70% | < 50% |
| 检索召回率 | > 85% | < 70% | < 50% |
| F1分数 | > 0.85 | < 0.70 | < 0.50 |
| MRR | > 0.90 | < 0.70 | < 0.50 |
常见陷阱
- 没有真实数据 - 没有基线数据无法改进
- 收集稀疏 - 需要多样化、有代表性的真实数据
- 没有验证 - 验证真实数据质量
- 忽视反馈 - 用户反馈很有价值
- 错误的指标 - 使用标准、可比较的指标
- 没有持续改进 - 系统没有反馈循环会退化
- 数据偏见 - 确保多样化、有代表性的真实数据
- 没有文档 - 文档化指标和方法论
额外资源