```python
def validate_context_and_answer(example, pred, trace=None):
    # Check that the predicted answer matches the gold answer.
    answer_match = example.answer.lower() == pred.answer.lower()

    # Check that the predicted answer appears in at least one retrieved context.
    context_match = any((pred.answer.lower() in c) for c in pred.context)

    if trace is None:  # Evaluation / optimization: return partial credit.
        return (answer_match + context_match) / 2.0
    else:  # Bootstrapping demonstrations: require both checks to pass.
        return answer_match and context_match
```
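For context, a metric like this is normally handed to DSPy's built-in evaluator rather than called by hand. A minimal sketch, assuming you already have a `devset` of `dspy.Example` objects and a `program` to score (both names are placeholders):

```python
from dspy.evaluate import Evaluate

# Placeholder devset/program; substitute your own examples and module.
evaluator = Evaluate(devset=devset, num_threads=1,
                     display_progress=True, display_table=5)

# Runs the program over the devset and scores every prediction with the metric.
evaluator(program, metric=validate_context_and_answer)
```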
How do you score long-form text? (A smarter approach)
The smarter approach is to lean on the AI's own abilities and score long-form output with another DSPy program. Anyone who appreciates recursion will recognize the move immediately. And because the metric is itself a DSPy program, you can optimize the metric first, then iterate on improving the final program.
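The code below calls a signature named `Assess` that is not shown in this excerpt. A minimal sketch of what it might look like, inferred from the field names used in the calls (`assessed_text`, `assessment_question`, `assessment_answer`):

```python
import dspy

class Assess(dspy.Signature):
    """Assess the quality of a tweet along the specified dimension."""

    assessed_text = dspy.InputField()
    assessment_question = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="Yes or No")
```

With `Assess` in hand, the judging metric itself looks like this: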
engaging = "Does the assessed text make for a self-contained, engaging tweet?" correct = f"The text should answer `{question}` with `{answer}`. Does the assessed text contain this answer?"
with dspy.context(lm=gpt4o): correct = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=correct) engaging = dspy.Predict(Assess)(assessed_text=tweet, assessment_question=engaging)
correct, engaging = [m.assessment_answer.lower() == 'yes'for m in [correct, engaging]] score = (correct + engaging) if correct and (len(tweet) <= 280) else0
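Tying back to the recursion point above: because the judge is just LM calls wrapped in DSPy modules, you can pull it out into its own small program and optimize it against a handful of human-labeled assessments before trusting it inside the metric. A minimal sketch under that assumption; `TweetJudge` is a hypothetical name and `Assess` is the signature sketched earlier:

```python
class TweetJudge(dspy.Module):
    """The LM-as-judge step as its own DSPy program, so the metric can be compiled."""

    def __init__(self):
        super().__init__()
        self.assess = dspy.ChainOfThought(Assess)

    def forward(self, assessed_text, assessment_question):
        return self.assess(assessed_text=assessed_text,
                           assessment_question=assessment_question)
```

Compiled with a few labeled judgments (for example via `dspy.BootstrapFewShot`), an instance of `TweetJudge` can stand in for the raw `dspy.Predict(Assess)` calls inside `metric`.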
```python
def validate_hops(example, pred, trace=None):
    # Collect the original question plus every search query generated along the trace.
    hops = [example.question] + [outputs.query for *_, outputs in trace if 'query' in outputs]

    # Reject traces containing an overly long query.
    if max([len(h) for h in hops]) > 100:
        return False

    # Reject traces where a query is a near-duplicate (80%+ match) of an earlier hop.
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8)
           for idx in range(2, len(hops))):
        return False

    return True
```
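A trace-aware metric like `validate_hops` is typically passed to an optimizer, so that only traces whose intermediate queries pass these checks are kept as demonstrations. A minimal sketch, where `MultiHopQA` and `trainset` are hypothetical placeholders for your own program and data:

```python
# Placeholder program and dataset names.
teleprompter = dspy.BootstrapFewShot(metric=validate_hops)
compiled_program = teleprompter.compile(MultiHopQA(), trainset=trainset)
```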