🧠 AI Eval and Observability
Offline Evaluation Datasets
Interview Answer
Common Question Phrasings
Why do AI applications need an offline evaluation dataset? How do you build one?
Answer
An offline evaluation dataset is the baseline for system optimization. Without one, it is hard to tell whether a change improved the system overall or merely looks better on a handful of examples. When building one, you typically want to cover high-frequency scenarios, hard cases, edge cases, and known failure cases, and keep the annotation standards as consistent as possible.
# Example: building an offline evaluation dataset
import time
from difflib import SequenceMatcher


class OfflineEvaluationDataset:
    def __init__(self, dataset_name, version='v1'):
        self.dataset_name = dataset_name
        self.version = version
        self.samples = []
        self.metrics = {}
        self.creation_time = time.time()

    def add_sample(self, question, expected_answer,
                   category='general', difficulty='medium',
                   metadata=None):
        """
        Add an evaluation sample.

        Parameters:
        - question: the question
        - expected_answer: the expected answer
        - category: question category
        - difficulty: difficulty level
        - metadata: extra information
        """
        sample_id = len(self.samples) + 1
        sample = {
            'id': sample_id,
            'question': question,
            'expected_answer': expected_answer,
            'category': category,
            'difficulty': difficulty,
            'metadata': metadata or {},
            'created_at': time.time(),
            'annotations': []
        }
        self.samples.append(sample)
        return sample_id

    def get_sample(self, sample_id):
        """Look up a sample by id (ids are 1-based and assigned sequentially)."""
        for sample in self.samples:
            if sample['id'] == sample_id:
                return sample
        return None

    def annotate_sample(self, sample_id, system_output,
                        annotator='system', notes=''):
        """
        Annotate a system output:
        1. Compare it against the expected answer
        2. Score it
        3. Record any issues
        """
        sample = self.get_sample(sample_id)
        if not sample:
            raise ValueError(f"Sample {sample_id} does not exist")
        # Compute similarity/accuracy
        accuracy_score = self.calculate_accuracy(
            system_output, sample['expected_answer']
        )
        # Evaluate answer quality
        quality_metrics = self.evaluate_quality(system_output)
        annotation = {
            'annotator': annotator,
            'timestamp': time.time(),
            'system_output': system_output,
            'accuracy_score': accuracy_score,
            'quality_metrics': quality_metrics,
            'notes': notes,
            'issues': self.identify_issues(system_output, sample)
        }
        sample['annotations'].append(annotation)
        return annotation

    def calculate_accuracy(self, system_output, expected_answer):
        """
        Compute the similarity between the system output and the expected answer.
        Several evaluation methods are possible:
        - text similarity
        - semantic similarity
        - keyword matching
        """
        # Simple text similarity (more sophisticated methods can be used in practice)
        similarity = SequenceMatcher(
            None, system_output, expected_answer
        ).ratio()
        # Keyword matching (split() assumes whitespace-delimited text)
        expected_keywords = set(expected_answer.split())
        system_keywords = set(system_output.split())
        keyword_overlap = (len(expected_keywords & system_keywords) /
                           len(expected_keywords)) if expected_keywords else 0
        # Combined score
        combined_score = 0.6 * similarity + 0.4 * keyword_overlap
        return {
            'text_similarity': similarity,
            'keyword_overlap': keyword_overlap,
            'combined_score': combined_score
        }
    def evaluate_quality(self, system_output):
        """
        Evaluate answer quality along four axes:
        1. Completeness
        2. Consistency
        3. Readability
        4. Safety
        """
        quality_scores = {}
        # Completeness: does it actually answer the question?
        quality_scores['completeness'] = self.check_completeness(system_output)
        # Consistency: does the answer contradict itself?
        quality_scores['consistency'] = self.check_consistency(system_output)
        # Readability: is it clear and easy to follow?
        quality_scores['readability'] = self.check_readability(system_output)
        # Safety: does it contain harmful content?
        quality_scores['safety'] = self.check_safety(system_output)
        # Combined quality score
        quality_scores['overall'] = sum(
            quality_scores[k] for k in ['completeness', 'consistency', 'readability']
        ) / 3
        return quality_scores

    # The four checks below are deliberately simplified placeholders so the
    # example runs; real implementations would use rubric- or model-based scoring.
    def check_completeness(self, system_output):
        return min(len(system_output) / 100, 1.0)

    def check_consistency(self, system_output):
        return 1.0

    def check_readability(self, system_output):
        return 1.0

    def check_safety(self, system_output):
        return 1.0

    def identify_issues(self, system_output, sample):
        """
        Identify problems in the system output:
        1. Factual errors
        2. Logical errors
        3. Incomplete answers
        4. Irrelevant information
        """
        issues = []
        # Check for factual errors (simplified example)
        if self.contains_factual_error(system_output, sample):
            issues.append({
                'type': 'factual_error',
                'severity': 'high',
                'description': 'Contains a factual error'
            })
        # Check for incompleteness
        if self.is_incomplete(system_output, sample):
            issues.append({
                'type': 'incomplete',
                'severity': 'medium',
                'description': 'Answer is incomplete'
            })
        # Check for irrelevant information
        if self.contains_irrelevant_info(system_output, sample):
            issues.append({
                'type': 'irrelevant',
                'severity': 'low',
                'description': 'Contains irrelevant information'
            })
        return issues

    # Simplified placeholder detectors; real systems would fact-check against
    # references and score coverage and relevance properly.
    def contains_factual_error(self, system_output, sample):
        return False

    def is_incomplete(self, system_output, sample):
        return len(system_output) < 20

    def contains_irrelevant_info(self, system_output, sample):
        return False
    def run_evaluation(self, system):
        """
        Run a full evaluation:
        1. Get the system output for each sample
        2. Auto-annotate it
        3. Compute aggregate metrics
        """
        results = {
            'total_samples': len(self.samples),
            'by_category': {},
            'by_difficulty': {},
            'overall_metrics': {}
        }
        # Initialize per-category and per-difficulty buckets
        for category in set(s['category'] for s in self.samples):
            results['by_category'][category] = {
                'count': 0,
                'accuracy': 0,
                'quality': 0
            }
        for difficulty in set(s['difficulty'] for s in self.samples):
            results['by_difficulty'][difficulty] = {
                'count': 0,
                'accuracy': 0,
                'quality': 0
            }
        # Evaluate each sample
        for sample in self.samples:
            # Get the system output
            system_output = system.generate(sample['question'])
            # Auto-annotate
            annotation = self.annotate_sample(
                sample['id'], system_output, annotator='auto'
            )
            # Update the statistics
            category = sample['category']
            difficulty = sample['difficulty']
            results['by_category'][category]['count'] += 1
            results['by_category'][category]['accuracy'] += \
                annotation['accuracy_score']['combined_score']
            results['by_category'][category]['quality'] += \
                annotation['quality_metrics']['overall']
            results['by_difficulty'][difficulty]['count'] += 1
            results['by_difficulty'][difficulty]['accuracy'] += \
                annotation['accuracy_score']['combined_score']
            results['by_difficulty'][difficulty]['quality'] += \
                annotation['quality_metrics']['overall']
        # Convert sums into averages
        for category_stats in results['by_category'].values():
            if category_stats['count'] > 0:
                category_stats['accuracy'] /= category_stats['count']
                category_stats['quality'] /= category_stats['count']
        for difficulty_stats in results['by_difficulty'].values():
            if difficulty_stats['count'] > 0:
                difficulty_stats['accuracy'] /= difficulty_stats['count']
                difficulty_stats['quality'] /= difficulty_stats['count']
        # Overall metrics (a macro average across categories)
        total_accuracy = sum(
            stats['accuracy'] for stats in results['by_category'].values()
        ) / len(results['by_category'])
        total_quality = sum(
            stats['quality'] for stats in results['by_category'].values()
        ) / len(results['by_category'])
        results['overall_metrics'] = {
            'accuracy': total_accuracy,
            'quality': total_quality,
            'hallucination_rate': self.calculate_hallucination_rate()
        }
        return results

    def calculate_hallucination_rate(self):
        """
        Compute the hallucination rate:
        the fraction of samples whose annotations contain a factual error.
        """
        total_samples = len(self.samples)
        if total_samples == 0:
            return 0
        hallucinated_samples = 0
        for sample in self.samples:
            for annotation in sample['annotations']:
                issues = annotation.get('issues', [])
                if any(issue['type'] == 'factual_error' for issue in issues):
                    hallucinated_samples += 1
                    break
        return hallucinated_samples / total_samples
# Example usage of the evaluation dataset
def build_evaluation_dataset():
    """Build a small, high-quality evaluation dataset."""
    dataset = OfflineEvaluationDataset("AI QA system eval", version="v1.0")
    # Add samples of different kinds
    dataset.add_sample(
        question="What is machine learning?",
        expected_answer="Machine learning is a branch of artificial intelligence that uses algorithms to let computers learn patterns from data and make predictions or decisions.",
        category="general",
        difficulty="easy"
    )
    dataset.add_sample(
        question="How do you handle hallucinations in a RAG system?",
        expected_answer="Reducing hallucinations takes multiple layers of defense: 1) optimize the retrieval pipeline to improve recall quality; 2) use a citation mechanism so answers are grounded in sources; 3) design a refusal mechanism; 4) evaluate and iterate continuously.",
        category="rag",
        difficulty="hard"
    )
    dataset.add_sample(
        question="What is the difference between Tool Calling and an ordinary function call?",
        expected_answer="With Tool Calling the model decides autonomously which tool to invoke, whereas an ordinary function call is predetermined by the program. Tool Calling lets the model extend its capabilities with external tools.",
        category="agent",
        difficulty="medium"
    )
    return dataset
def demo_evaluation():
    """Walk through the evaluation process."""
    # Build the evaluation dataset
    dataset = build_evaluation_dataset()

    # Create a mock system
    class DemoSystem:
        def generate(self, question):
            # Simulated system output
            return f"This is a mock answer to the question '{question}'."

    system = DemoSystem()
    # Run the evaluation
    results = dataset.run_evaluation(system)
    # Print the results
    print(f"Evaluation finished: {results['total_samples']} samples")
    print(f"Overall accuracy: {results['overall_metrics']['accuracy']:.2%}")
    print(f"Overall quality: {results['overall_metrics']['quality']:.2%}")
    print(f"Hallucination rate: {results['overall_metrics']['hallucination_rate']:.2%}")
    # Breakdown by category
    print("\nBy category:")
    for category, stats in results['by_category'].items():
        print(f"  {category}: {stats['accuracy']:.2%} accuracy, {stats['count']} samples")
    # Breakdown by difficulty
    print("\nBy difficulty:")
    for difficulty, stats in results['by_difficulty'].items():
        print(f"  {difficulty}: {stats['accuracy']:.2%} accuracy, {stats['count']} samples")
Follow-ups
- Why does the evaluation dataset need continuous updates? (data distribution drift)
- Why do RAG and Agent evaluation datasets look different? (different task types)
- If manual annotation is expensive, how do you control the investment? (semi-automation; see the sketch below)
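For the semi-automation follow-up, a minimal sketch of the idea: let a model-based grader score everything first and route only the low-confidence cases to humans. Here `call_llm` is a hypothetical stand-in for whatever model client you use, and each sample dict is assumed to already carry the `system_output` to grade.
# Semi-automated annotation sketch: auto-grade first, escalate uncertain cases.
def triage_annotations(samples, call_llm, confidence_threshold=0.7):
    """call_llm(prompt) -> str is a hypothetical placeholder for your model client."""
    needs_human_review = []
    for sample in samples:
        prompt = (
            "Score 0-1 how well the output answers the question.\n"
            f"Question: {sample['question']}\n"
            f"Expected: {sample['expected_answer']}\n"
            f"Output: {sample['system_output']}\n"
            "Reply with only the number."
        )
        try:
            score = float(call_llm(prompt).strip())
        except ValueError:
            score = None  # unparseable grader reply -> send to a human
        # Scores near the middle of the scale are the least trustworthy.
        if score is None or abs(score - 0.5) < (1 - confidence_threshold):
            needs_human_review.append(sample)
        else:
            sample['auto_score'] = score
    return needs_human_review
Humans then only touch the escalated subset, which is usually where the labeling budget matters most.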
Principle Deep-Dive
An AI system has many moving parts: the model, the prompts, the retrieval parameters, and the tool chain all affect the output. The role of an offline evaluation dataset is to put all of these changes into one shared comparison framework, so that optimization does not rest entirely on subjective impressions.
When building one, representativeness and reproducibility matter most. A small but stable evaluation set beats judging changes from scattered, ad-hoc cases.
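As a minimal sketch of the "same comparison framework" point (a hypothetical helper built on the classes above, not part of the original design): run the baseline and the candidate against identical, freshly built copies of the dataset and diff the metrics. Fresh copies matter because run_evaluation appends annotations to the samples it touches.
# Hypothetical helper: compare two system versions on the same frozen dataset.
def compare_systems(build_dataset, baseline_system, candidate_system):
    # Each side gets its own copy because run_evaluation mutates samples.
    base = build_dataset().run_evaluation(baseline_system)
    cand = build_dataset().run_evaluation(candidate_system)
    deltas = {
        category: cand['by_category'][category]['accuracy']
                  - base['by_category'][category]['accuracy']
        for category in base['by_category']
    }
    overall = (cand['overall_metrics']['accuracy']
               - base['overall_metrics']['accuracy'])
    return {'overall_accuracy_delta': overall, 'by_category_delta': deltas}

# Usage: report = compare_systems(build_evaluation_dataset, old_system, new_system)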
# Quality control for the evaluation dataset
import random
import time


class EvaluationQualityControl:
    def __init__(self):
        self.annotators = []
        self.quality_threshold = 0.85

    def add_annotator(self, annotator_id, expertise_level='medium'):
        """Register an annotator."""
        self.annotators.append({
            'id': annotator_id,
            'expertise': expertise_level,
            'reliability_score': 1.0,
            'annotation_count': 0
        })

    def get_annotator(self, annotator_id):
        """Look up an annotator by id."""
        for annotator in self.annotators:
            if annotator['id'] == annotator_id:
                return annotator
        return None

    def assign_annotation_task(self, sample, annotators=None):
        """
        Assign an annotation task:
        1. Pick annotators suited to the sample's difficulty
        2. Assign more than one annotator to improve reliability
        """
        if annotators is None:
            annotators = self.annotators
        # Select annotators by difficulty
        if sample['difficulty'] == 'hard':
            # Hard samples need expert annotators
            suitable_annotators = [
                a for a in annotators
                if a['expertise'] in ['expert', 'senior']
            ]
        else:
            # Ordinary samples can go to any annotator
            suitable_annotators = annotators
        # Assign at least two annotators when possible
        selected_annotators = random.sample(
            suitable_annotators,
            min(2, len(suitable_annotators))
        )
        return [a['id'] for a in selected_annotators]
    def calculate_annotator_reliability(self, annotator_id):
        """
        Estimate an annotator's reliability, based on:
        1. Agreement with other annotators
        2. Annotation quality scores
        3. Annotation volume (experience)
        """
        annotator = self.get_annotator(annotator_id)
        # Agreement with other annotators
        consistency_score = self.calculate_consistency(annotator_id)
        # Annotation quality
        quality_score = self.calculate_quality_score(annotator_id)
        # Annotation volume (experience factor)
        experience_factor = min(annotator['annotation_count'] / 100, 1.0)
        # Combined reliability score
        reliability = (0.5 * consistency_score +
                       0.3 * quality_score +
                       0.2 * experience_factor)
        # Update the annotator's reliability
        annotator['reliability_score'] = reliability
        return reliability

    # Simplified placeholders; real implementations would measure
    # inter-annotator agreement on overlapping samples and audit results.
    def calculate_consistency(self, annotator_id):
        return 1.0

    def calculate_quality_score(self, annotator_id):
        return 1.0
    def consensus_annotation(self, sample, annotations):
        """
        Produce a consensus annotation:
        1. Check agreement between annotations
        2. Resolve disagreements
        3. Emit the final annotation
        """
        if len(annotations) < 2:
            return annotations[0] if annotations else None
        # Pairwise similarity between annotations
        similarities = []
        for i in range(len(annotations)):
            for j in range(i + 1, len(annotations)):
                sim = self.calculate_similarity(
                    annotations[i], annotations[j]
                )
                similarities.append(sim)
        avg_similarity = sum(similarities) / len(similarities)
        # High agreement: keep the annotation from the most reliable annotator
        if avg_similarity > 0.8:
            annotator_scores = []
            for i, annotation in enumerate(annotations):
                annotator_id = annotation['annotator']
                reliability = self.get_annotator(annotator_id)['reliability_score']
                annotator_scores.append((i, reliability))
            # Pick the most reliable one
            best_idx = max(annotator_scores, key=lambda x: x[1])[0]
            return annotations[best_idx]
        # Low agreement: escalate to arbitration
        return self.arbitrate_disagreement(sample, annotations)

    # Simplified placeholders; real systems would diff annotation contents
    # and route disagreements to a senior reviewer.
    def calculate_similarity(self, annotation_a, annotation_b):
        return 1.0 if annotation_a == annotation_b else 0.5

    def arbitrate_disagreement(self, sample, annotations):
        return annotations[0]
    def update_dataset_version(self, dataset, changes_description):
        """
        Create a new version of the evaluation dataset:
        1. Record the changes
        2. Apply version control
        3. Keep results reproducible
        """
        new_version = self.increment_version(dataset.version)
        # Create the new version
        new_dataset = OfflineEvaluationDataset(
            dataset.dataset_name,
            version=new_version
        )
        # Copy the samples over
        for sample in dataset.samples:
            new_dataset.add_sample(
                question=sample['question'],
                expected_answer=sample['expected_answer'],
                category=sample['category'],
                difficulty=sample['difficulty'],
                metadata={
                    **sample['metadata'],
                    'original_version': dataset.version,
                    'upgrade_reason': changes_description
                }
            )
        # Write the changelog entry
        upgrade_log = {
            'timestamp': time.time(),
            'from_version': dataset.version,
            'to_version': new_version,
            'changes': changes_description,
            'maintainer': 'system'
        }
        self.save_upgrade_log(dataset.dataset_name, upgrade_log)
        return new_dataset

    # Simplified placeholders for version bumping and persistence.
    def increment_version(self, version):
        # Bumps the major version only, e.g. 'v1' -> 'v2', 'v1.0' -> 'v2'.
        major = int(version.lstrip('v').split('.')[0])
        return f"v{major + 1}"

    def save_upgrade_log(self, dataset_name, upgrade_log):
        print(f"[changelog] {dataset_name}: {upgrade_log}")
Common Pitfalls
- Samples only cover easy scenarios (lack of representativeness)
- Evaluation criteria change from run to run, making results incomparable (lack of consistency)
- Ignoring annotation quality (subjectivity creeps in)
- Letting the dataset go stale (data distribution drift)
Memory Aids
Remember the three essentials of dataset construction:
- Representativeness = "cover all kinds of scenarios"
- Consistency = "one unified standard"
- Continuity = "update regularly"
Typical use cases:
- Model optimization: compare performance across versions
- Parameter tuning: evaluate the effect of configuration changes
- Algorithm selection: compare alternative approaches
- Regression testing: prevent performance degradation (see the sketch below)
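For the regression-testing use case, a minimal sketch (a hypothetical gate built on run_evaluation above, assuming baseline_metrics was saved from an earlier known-good run): fail the check whenever overall accuracy drops past a tolerance.
# Hypothetical regression gate: compare a fresh run against stored baseline metrics.
def regression_gate(dataset, system, baseline_metrics, tolerance=0.02):
    results = dataset.run_evaluation(system)
    current = results['overall_metrics']
    drop = baseline_metrics['accuracy'] - current['accuracy']
    passed = drop <= tolerance
    if not passed:
        print(f"Regression: accuracy fell {drop:.2%} (tolerance {tolerance:.2%})")
    return passed, current

# Run this before shipping any model/prompt change, and pin the dataset
# version so the comparison stays apples-to-apples.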