Accuracy Evaluation

Judge agent responses against expected answers:
import asyncio
from definable.agent import Agent
from definable.agent.eval import AccuracyEval, EvalCase

agent = Agent(model="gpt-4o", instructions="Answer questions concisely.")

accuracy_eval = AccuracyEval(  # named to avoid shadowing the built-in eval
    judge_model="openai/gpt-4o-mini",
    threshold=7.0,  # Pass if score >= 7/10
)

async def main():
    result = await accuracy_eval.arun(agent, EvalCase(
        input="What is the capital of France?",
        expected="Paris",
    ))
    print(f"Score: {result.score}/10 — {'PASS' if result.passed else 'FAIL'}")

asyncio.run(main())
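
When a case fails, the judge's rationale is the fastest way to debug. As a sketch, assuming AccuracyEval results expose the same reasoning field that AgentAsJudgeEval results do (shown further down), a quick check looks like:
if not result.passed:
    # Assumption: AccuracyEval results carry the judge's rationale in `reasoning`
    print(f"Judge reasoning: {result.reasoning}")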

Batch Evaluation

Run multiple cases and get aggregated metrics:
cases = [
    EvalCase(input="Capital of France?", expected="Paris"),
    EvalCase(input="Capital of Japan?", expected="Tokyo"),
    EvalCase(input="Capital of Brazil?", expected="Brasilia"),
]

suite = await accuracy_eval.arun_batch(agent, cases)
print(f"Pass rate: {suite.pass_rate:.0%}")
print(f"Average score: {suite.average_score:.1f}/10")

Performance Evaluation

Measure runtime and memory usage:
from definable.agent.eval import PerformanceEval, EvalCase

perf = PerformanceEval(
    duration_threshold_ms=5000,  # Fail if slower than 5s
    memory_threshold_mb=100,     # Fail if uses more than 100MB
    warmup_runs=1,               # Warm up before measuring
)

result = await perf.arun(agent, EvalCase(input="Hello"))
print(f"Duration: {result.duration_ms:.0f}ms")
print(f"Memory: {result.memory_mb:.1f}MB")

Reliability Evaluation

Verify the agent calls expected tools:
from definable.agent.eval import ReliabilityEval, EvalCase

rel = ReliabilityEval(strict=True)  # All expected tools must be called

result = await rel.arun(agent, EvalCase(
    input="What's the weather in Paris?",
    expected_tools=["get_weather"],
))
print(f"Tools called: {result.tools_called}")
print(f"Passed: {result.passed}")

Agent-as-Judge Evaluation

Evaluate responses against custom criteria with an LLM judge:
from definable.agent.eval import AgentAsJudgeEval, EvalCase

judge = AgentAsJudgeEval(
    judge_model="openai/gpt-4o",
    criteria="Is the response helpful, accurate, and concise?",
    mode="numeric",  # or "binary"
)

result = await judge.arun(agent, EvalCase(
    input="Explain quantum computing in one sentence.",
))
print(f"Score: {result.score}/10")
print(f"Reasoning: {result.reasoning}")

Team Evaluation

All eval types support team evaluation:
# `team` is a previously constructed team object
result = await accuracy_eval.arun_team(team, EvalCase(
    input="Research quantum computing advances",
    expected="A comprehensive summary...",
))
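
Batch runs work the same way for teams. If you'd rather not assume a dedicated batch method exists, a plain loop over arun_team gives the same aggregate:
results = [await accuracy_eval.arun_team(team, case) for case in cases]
pass_rate = sum(r.passed for r in results) / len(results)
print(f"Team pass rate: {pass_rate:.0%}")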