Documentation Index
Fetch the complete documentation index at: https://docs.abv.dev/llms.txt
Use this file to discover all available pages before exploring further.
Evaluations help you measure and improve LLM output quality through datasets, automated scoring, and experiments.
Score an Observation
Add quality scores to any traced observation.
- Python
- JavaScript
from abvdev import ABV, observe
abv = ABV(api_key="sk-abv-...")
@observe()
def generate_response(query: str) -> str:
    """Answer ``query`` via the gateway and attach example scores to the span.

    Args:
        query: The user message sent to the model.

    Returns:
        The content of the first choice in the chat completion.
    """
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Score this observation
    abv.score_current_span(
        name="relevance",
        value=0.95,
        data_type="NUMERIC",
        comment="Highly relevant response"
    )

    # Boolean score (this example passes 1.0 as the value)
    abv.score_current_span(
        name="is_helpful",
        value=1.0,
        data_type="BOOLEAN"
    )

    # Categorical score (the value is a string label)
    abv.score_current_span(
        name="tone",
        value="professional",
        data_type="CATEGORICAL"
    )

    return output
import { startActiveObservation } from "@abvdev/tracing";
// NOTE(review): `ABVClient` is not imported anywhere in this snippet — confirm
// which package exports it and add the import.
const abv = new ABVClient({ apiKey: "sk-abv-..." });

await startActiveObservation("generate-response", async (span) => {
  const completion = await abv.gateway.chat.completions.create({
    provider: "openai",
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "What is AI?" }],
  });
  const { content: output } = completion.choices[0].message;

  // Attach both example scores to the observation that is currently active.
  const observationScores = [
    { name: "relevance", value: 0.95, comment: "Highly relevant response" },
    { name: "is_helpful", value: 1.0 },
  ];
  for (const observationScore of observationScores) {
    abv.score.activeObservation(observationScore);
  }

  span.update({ output });
  return output;
});

// Flush the score client once the observation callback has finished.
await abv.score.flush();
Score a Trace
Apply scores to the entire trace (multiple observations).
- Python
- JavaScript
@observe()
def full_pipeline(query: str) -> str:
    """Run the pipeline for ``query`` and score the trace as a whole.

    Args:
        query: Input forwarded to ``process``.

    Returns:
        Whatever ``process(query)`` returns.
    """
    # Multiple operations happen here...
    # NOTE(review): `process` is not defined in this snippet — presumably a
    # placeholder for your own pipeline; confirm.
    result = process(query)

    # Score the entire trace
    abv.score_current_trace(
        name="user_satisfaction",
        value=4.5,
        data_type="NUMERIC",
        comment="User rated 4.5/5 stars"
    )

    abv.score_current_trace(
        name="task_completed",
        value=1.0,
        data_type="BOOLEAN"
    )

    return result
// NOTE(review): `process` and `query` are not defined in this snippet —
// presumably placeholders for your own pipeline and input; confirm.
await startActiveObservation("full-pipeline", async (span) => {
  const result = await process(query);

  // Trace-level scores: sent through `activeTrace`, not `activeObservation`.
  const traceScores = [
    { name: "user_satisfaction", value: 4.5, comment: "User rated 4.5/5 stars" },
    { name: "task_completed", value: 1.0 },
  ];
  for (const traceScore of traceScores) {
    abv.score.activeTrace(traceScore);
  }

  return result;
});
Create a Dataset
Create evaluation datasets to systematically test your LLM.
- Python
- JavaScript
# Create dataset
dataset = abv.create_dataset(
    name="customer-support-eval",
    description="Test cases for customer support bot",
    metadata={"version": "1.0", "domain": "support"}
)

# Add test items: each case pairs a structured input with the expected answer.
test_cases = [
    {
        "input": {"query": "How do I reset my password?"},
        "expected_output": "Go to Settings > Security > Reset Password"
    },
    {
        "input": {"query": "What are your business hours?"},
        "expected_output": "We're open Monday-Friday, 9 AM - 5 PM EST"
    },
    {
        "input": {"query": "I want a refund"},
        "expected_output": "I'll help you with the refund process..."
    }
]

# Items are attached to the dataset by name; every item gets the same metadata.
for case in test_cases:
    abv.create_dataset_item(
        dataset_name="customer-support-eval",
        input=case["input"],
        expected_output=case["expected_output"],
        metadata={"category": "faq"}
    )
// Register the dataset that the evaluation examples read from.
await abv.api.datasets.create({
  name: "customer-support-eval",
  description: "Test cases for customer support bot",
  metadata: { version: "1.0", domain: "support" },
});

// Seed items: a structured input plus the answer we expect for it.
const faqCases = [
  {
    input: { query: "How do I reset my password?" },
    expectedOutput: "Go to Settings > Security > Reset Password",
  },
  {
    input: { query: "What are your business hours?" },
    expectedOutput: "We're open Monday-Friday, 9 AM - 5 PM EST",
  },
  {
    input: { query: "I want a refund" },
    expectedOutput: "I'll help you with the refund process...",
  },
];

// Items are created one at a time, in order, and attached by dataset name.
for (const { input, expectedOutput } of faqCases) {
  await abv.api.datasetItems.create({
    datasetName: "customer-support-eval",
    input,
    expectedOutput,
    metadata: { category: "faq" },
  });
}
Run Dataset Evaluation
Iterate through dataset items and score results.
- Python
- JavaScript
from abvdev import ABV, observe
abv = ABV(api_key="sk-abv-...")
def evaluate_similarity(expected: str, actual: str) -> float:
    """Simple overlap-based similarity.

    Both strings are lower-cased and split on whitespace into sets of words.

    Args:
        expected: Reference text.
        actual: Text to compare against the reference.

    Returns:
        The fraction of distinct words in ``expected`` that also appear in
        ``actual``; 0.0 when ``expected`` contains no words.
    """
    expected_words = set(expected.lower().split())
    actual_words = set(actual.lower().split())
    overlap = len(expected_words & actual_words)
    # max(..., 1) avoids dividing by zero for an empty reference.
    return overlap / max(len(expected_words), 1)
@observe()
def run_evaluation():
    """Evaluate every item of the "customer-support-eval" dataset.

    Each item is executed inside ``item.run(...)`` under the run name
    "eval-v1": the item's query is sent to the model, the reply is compared
    with the expected output, and two scores are recorded on the run span.

    Returns:
        A list with one dict per item holding its input, expected output,
        actual output and similarity.
    """
    dataset = abv.get_dataset("customer-support-eval")

    results = []
    for item in dataset.items:
        with item.run(
            run_name="eval-v1",
            run_description="Baseline evaluation"
        ) as span:
            # Generate response
            response = abv.gateway.complete_chat(
                provider="openai",
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": item.input["query"]}]
            )
            actual_output = response.choices[0].message.content

            # Score against expected
            similarity = evaluate_similarity(
                item.expected_output,
                actual_output
            )

            span.score(
                name="similarity",
                value=similarity,
                data_type="NUMERIC"
            )

            # Boolean pass/fail: similarity above 0.7 counts as a match.
            span.score(
                name="matches_expected",
                value=1.0 if similarity > 0.7 else 0.0,
                data_type="BOOLEAN"
            )

            results.append({
                "input": item.input,
                "expected": item.expected_output,
                "actual": actual_output,
                "similarity": similarity
            })

    return results
results = run_evaluation()
# An empty dataset yields no results; skip the average instead of dividing by zero.
if results:
    avg_similarity = sum(r["similarity"] for r in results) / len(results)
    print(f"Average similarity: {avg_similarity:.2%}")
else:
    print("No dataset items were evaluated.")
import { startActiveObservation } from "@abvdev/tracing";
/**
 * Simple overlap-based similarity between two texts.
 *
 * Both strings are lower-cased and split on whitespace into sets of words.
 * Empty tokens (produced by leading/trailing whitespace or an empty string)
 * are discarded so that padding never counts as a word.
 *
 * @param expected Reference text.
 * @param actual Text to compare against the reference.
 * @returns Fraction of distinct words in `expected` that also appear in
 *   `actual`; 0 when `expected` contains no words.
 */
function evaluateSimilarity(expected: string, actual: string): number {
  const toWordSet = (text: string): Set<string> =>
    new Set(
      text
        .toLowerCase()
        .split(/\s+/)
        .filter((word) => word.length > 0)
    );

  const expectedWords = toWordSet(expected);
  const actualWords = toWordSet(actual);
  const overlap = [...expectedWords].filter((w) => actualWords.has(w)).length;
  // Math.max(..., 1) avoids dividing by zero for an empty reference.
  return overlap / Math.max(expectedWords.size, 1);
}
// Evaluates every item of the "customer-support-eval" dataset inside one
// parent observation, with one child observation per item.
await startActiveObservation("run-evaluation", async () => {
  const dataset = await abv.dataset.get("customer-support-eval");

  // Typed explicitly so the array does not fall back to an implicit `any[]`.
  const results: { similarity: number }[] = [];

  for (const item of dataset.items) {
    await startActiveObservation(`eval-item-${item.id}`, async (span) => {
      const response = await abv.gateway.chat.completions.create({
        provider: "openai",
        model: "gpt-4o-mini",
        messages: [{ role: "user", content: item.input.query }],
      });
      // Fall back to an empty string in case the completion has no content,
      // since evaluateSimilarity expects a string.
      const actualOutput = response.choices[0].message.content ?? "";

      const similarity = evaluateSimilarity(item.expectedOutput, actualOutput);

      abv.score.activeObservation({
        name: "similarity",
        value: similarity,
      });

      // Pass/fail: similarity above 0.7 counts as a match.
      abv.score.activeObservation({
        name: "matches_expected",
        value: similarity > 0.7 ? 1.0 : 0.0,
      });

      // Link to dataset item
      await item.link({ otelSpan: span }, "eval-v1", {
        description: "Baseline evaluation",
      });

      results.push({ similarity });
    });
  }

  // An empty dataset would otherwise print "NaN%".
  if (results.length === 0) {
    console.log("No dataset items were evaluated.");
    return;
  }

  const avgSimilarity =
    results.reduce((sum, result) => sum + result.similarity, 0) /
    results.length;
  console.log(`Average similarity: ${(avgSimilarity * 100).toFixed(1)}%`);
});

await abv.score.flush();
LLM-as-Judge Scoring
Use an LLM to evaluate output quality.
- Python
- JavaScript
import json
from abvdev import ABV, observe
abv = ABV(api_key="sk-abv-...")
# Prompt template for the judge model. {query} and {response} are str.format
# fields; the braces of the JSON example are doubled so they survive formatting.
JUDGE_PROMPT = """Evaluate the following response for quality.
User Query: {query}
Response: {response}
Score each criterion from 0-10:
1. Relevance: Does the response answer the query?
2. Accuracy: Is the information correct?
3. Clarity: Is the response clear and well-structured?
4. Helpfulness: Does it provide actionable information?
Respond in JSON format:
{{"relevance": N, "accuracy": N, "clarity": N, "helpfulness": N, "reasoning": "..."}}
"""
@observe(name="llm-judge", as_type="evaluator")
def judge_response(query: str, response: str) -> dict:
    """Ask a judge model to rate ``response`` and record the ratings as scores.

    Args:
        query: The original user query.
        response: The model output being judged.

    Returns:
        The parsed JSON reply of the judge (criteria on the 0-10 scale the
        prompt asks for, plus any "reasoning"), extended with an
        "overall_quality" entry holding the mean criterion normalised to 0-1.

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
        KeyError: If the reply is missing one of the four criteria.
    """
    criteria = ["relevance", "accuracy", "clarity", "helpfulness"]

    # Named judge_result so the local does not shadow this function's own name.
    judge_result = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": JUDGE_PROMPT.format(query=query, response=response)
        }],
        temperature=0  # Deterministic evaluation
    )

    scores = json.loads(judge_result.choices[0].message.content)

    # Record individual scores
    for criterion in criteria:
        abv.score_current_span(
            name=criterion,
            value=scores[criterion] / 10,  # Normalize to 0-1
            data_type="NUMERIC"
        )

    # Overall score
    overall = sum(scores[c] for c in criteria) / len(criteria)
    overall_quality = overall / 10
    abv.score_current_span(
        name="overall_quality",
        value=overall_quality,
        data_type="NUMERIC",
        comment=scores.get("reasoning", "")
    )

    # Expose the aggregate to callers too: the judge's JSON itself has no such
    # key, so reading scores["overall_quality"] would otherwise raise KeyError.
    scores["overall_quality"] = overall_quality

    return scores
# Use in evaluation pipeline
@observe()
def evaluate_with_judge(query: str) -> dict:
    """Generate an answer for ``query`` and have the judge rate it.

    Args:
        query: The user message sent to the model.

    Returns:
        A dict with the generated text under "response" and the value
        returned by ``judge_response`` under "scores".
    """
    # Generate response
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Judge the response
    scores = judge_response(query, output)

    return {"response": output, "scores": scores}
import { startActiveObservation, startObservation } from "@abvdev/tracing";
// Prompt template for the judge model. `{query}` and `{response}` are plain
// placeholder tokens (not template-literal interpolations) that are
// substituted with String.prototype.replace before the prompt is sent.
const JUDGE_PROMPT = `Evaluate the following response for quality.
User Query: {query}
Response: {response}
Score each criterion from 0-10:
1. Relevance: Does the response answer the query?
2. Accuracy: Is the information correct?
3. Clarity: Is the response clear and well-structured?
4. Helpfulness: Does it provide actionable information?
Respond in JSON format:
{"relevance": N, "accuracy": N, "clarity": N, "helpfulness": N, "reasoning": "..."}`;
/**
 * Asks a judge model to rate `response` and records the ratings as scores on
 * the "llm-judge" evaluator observation.
 *
 * @param query The original user query.
 * @param response The model output being judged.
 * @returns The parsed JSON reply of the judge (criteria on the 0-10 scale the
 *   prompt asks for, plus `reasoning`), extended with `overall_quality`: the
 *   mean criterion normalised to 0-1.
 * @throws SyntaxError if the judge's reply is not valid JSON.
 */
async function judgeResponse(query: string, response: string) {
  return startActiveObservation(
    "llm-judge",
    async () => {
      // Single-pass substitution with a replacer function:
      //  - a function result is inserted verbatim, so `$&`, `$1`, `$$` etc.
      //    inside user text are not interpreted as replacement patterns;
      //  - one pass means a literal "{response}" inside `query` is never
      //    substituted a second time.
      const prompt = JUDGE_PROMPT.replace(
        /\{(query|response)\}/g,
        (_match: string, key: string) => (key === "query" ? query : response)
      );

      const judgeResult = await abv.gateway.chat.completions.create({
        provider: "openai",
        model: "gpt-4o",
        messages: [{ role: "user", content: prompt }],
        temperature: 0,
      });

      const scores = JSON.parse(judgeResult.choices[0].message.content);

      // Record scores, normalised from 0-10 to 0-1.
      const criteria = ["relevance", "accuracy", "clarity", "helpfulness"];
      for (const criterion of criteria) {
        abv.score.activeObservation({
          name: criterion,
          value: scores[criterion] / 10,
        });
      }

      const overall =
        (scores.relevance +
          scores.accuracy +
          scores.clarity +
          scores.helpfulness) /
        4;
      const overallQuality = overall / 10;
      abv.score.activeObservation({
        name: "overall_quality",
        value: overallQuality,
        comment: scores.reasoning,
      });

      // Expose the aggregate to callers too: the judge's JSON itself has no
      // such field, so `scores.overall_quality` would otherwise be undefined.
      scores.overall_quality = overallQuality;

      return scores;
    },
    { asType: "evaluator" }
  );
}
// Usage: generate an answer, then hand the same question and answer to the judge.
await startActiveObservation("evaluate-with-judge", async () => {
  const completion = await abv.gateway.chat.completions.create({
    provider: "openai",
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "What is machine learning?" }],
  });
  const { content: output } = completion.choices[0].message;

  const scores = await judgeResponse("What is machine learning?", output);
  return { response: output, scores };
});
Batch Scoring
Efficiently score multiple traces/observations.
- Python
- JavaScript
# Create scores in batch (automatically batched and flushed)
scores_to_create = [
    {"trace_id": "trace-1", "name": "quality", "value": 0.9},
    {"trace_id": "trace-2", "name": "quality", "value": 0.85},
    {"trace_id": "trace-3", "name": "quality", "value": 0.95},
]

# One create_score call per entry; every score is recorded as NUMERIC.
for score in scores_to_create:
    abv.create_score(
        trace_id=score["trace_id"],
        name=score["name"],
        value=score["value"],
        data_type="NUMERIC"
    )

# Force flush if needed
abv.flush()
// Scores to attach to already-recorded traces, addressed by trace id.
const pendingScores = [
  { traceId: "trace-1", name: "quality", value: 0.9 },
  { traceId: "trace-2", name: "quality", value: 0.85 },
  { traceId: "trace-3", name: "quality", value: 0.95 },
];

// One create call per entry, in order.
for (const { traceId, name, value } of pendingScores) {
  abv.score.create({ traceId, name, value });
}

// Scores are batched automatically
// Force flush if needed
await abv.score.flush();
A/B Test Prompts
Compare different prompts using dataset evaluation.
- Python
- JavaScript
from abvdev import ABV, observe
abv = ABV(api_key="sk-abv-...")
# Prompt variants under test, keyed by variant name. Each template has a
# single {query} field that is filled with str.format.
PROMPTS = {
"concise": "Answer briefly: {query}",
"detailed": "Provide a comprehensive answer with examples: {query}",
"step_by_step": "Answer step by step: {query}"
}
@observe()
def ab_test_prompts(dataset_name: str):
    """Compare every prompt variant in ``PROMPTS`` on a dataset.

    For each dataset item and each variant, the filled-in prompt is sent to
    the model inside its own span, the reply is rated by ``judge_response``,
    and the overall quality is collected per variant. The per-variant average
    is printed at the end.

    Args:
        dataset_name: Name of the dataset to load with ``abv.get_dataset``.
    """
    criteria = ("relevance", "accuracy", "clarity", "helpfulness")

    dataset = abv.get_dataset(dataset_name)
    results = {name: [] for name in PROMPTS}

    for item in dataset.items:
        for prompt_name, prompt_template in PROMPTS.items():
            with abv.start_as_current_span(name=f"test-{prompt_name}") as span:
                prompt = prompt_template.format(query=item.input["query"])

                response = abv.gateway.complete_chat(
                    provider="openai",
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}]
                )

                # Score response
                scores = judge_response(item.input["query"], response.choices[0].message.content)

                # The judge's JSON only carries the individual criteria (0-10),
                # so derive the 0-1 overall quality here when it is not
                # supplied instead of failing with KeyError.
                quality = scores.get("overall_quality")
                if quality is None:
                    quality = sum(scores[c] for c in criteria) / len(criteria) / 10
                results[prompt_name].append(quality)

                span.update(metadata={"prompt_variant": prompt_name})

    # Print comparison
    for name, scores in results.items():
        # An empty dataset leaves every list empty; avoid dividing by zero.
        if not scores:
            print(f"{name}: no results")
            continue
        avg = sum(scores) / len(scores)
        print(f"{name}: {avg:.2f} average quality")


ab_test_prompts("customer-support-eval")
// Prompt variants under test, keyed by variant name. Each template contains a
// `{query}` placeholder token that is substituted with String.prototype.replace.
const PROMPTS = {
concise: "Answer briefly: {query}",
detailed: "Provide a comprehensive answer with examples: {query}",
step_by_step: "Answer step by step: {query}",
};
// Compares every prompt variant in PROMPTS on the "customer-support-eval"
// dataset and prints the average judged quality per variant.
await startActiveObservation("ab-test-prompts", async () => {
  const dataset = await abv.dataset.get("customer-support-eval");

  const results: Record<string, number[]> = {};
  for (const name of Object.keys(PROMPTS)) {
    results[name] = [];
  }

  for (const item of dataset.items) {
    for (const [promptName, promptTemplate] of Object.entries(PROMPTS)) {
      await startActiveObservation(`test-${promptName}`, async (span) => {
        // A replacer function inserts the query verbatim; a plain string
        // replacement would interpret `$&`, `$1`, `$$` etc. in user text.
        const prompt = promptTemplate.replace(
          "{query}",
          () => item.input.query
        );

        const response = await abv.gateway.chat.completions.create({
          provider: "openai",
          model: "gpt-4o-mini",
          messages: [{ role: "user", content: prompt }],
        });

        const scores = await judgeResponse(
          item.input.query,
          response.choices[0].message.content
        );

        // The judge's JSON only carries the individual criteria (0-10), so
        // derive the 0-1 overall quality here when it is not supplied;
        // pushing `undefined` would turn the average into NaN.
        const quality: number =
          typeof scores.overall_quality === "number"
            ? scores.overall_quality
            : (scores.relevance +
                scores.accuracy +
                scores.clarity +
                scores.helpfulness) /
              4 /
              10;
        results[promptName].push(quality);

        span.update({ metadata: { prompt_variant: promptName } });
      });
    }
  }

  // Print comparison
  for (const [name, scores] of Object.entries(results)) {
    // An empty dataset leaves every list empty; avoid printing "NaN".
    if (scores.length === 0) {
      console.log(`${name}: no results`);
      continue;
    }
    const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
    console.log(`${name}: ${avg.toFixed(2)} average quality`);
  }
});
Integration Patterns
Next: Combining features
Evaluations Guide
Reference: Full evaluations documentation