Evaluations help you measure and improve LLM output quality through datasets, automated scoring, and experiments.
Score an Observation

Add quality scores to any traced observation.

Python
```python
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

@observe()
def generate_response(query: str) -> str:
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Score this observation
    abv.score_current_span(
        name="relevance",
        value=0.95,
        data_type="NUMERIC",
        comment="Highly relevant response"
    )

    # Boolean score
    abv.score_current_span(
        name="is_helpful",
        value=1.0,
        data_type="BOOLEAN"
    )

    # Categorical score
    abv.score_current_span(
        name="tone",
        value="professional",
        data_type="CATEGORICAL"
    )

    return output
```
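
In practice, scores are usually computed rather than hard-coded. As a minimal sketch, the helper below turns two simple checks into boolean scores using the same `score_current_span` call; the heuristics and the helper name are illustrative assumptions, not part of the SDK, and the call only works while a span is active (for example inside `generate_response` above).

```python
# Minimal sketch: derive BOOLEAN scores from simple heuristics.
# The checks are illustrative assumptions; only score_current_span comes from
# the SDK, and it must be called while a span is active (e.g. inside an
# @observe()-decorated function).
REFUSAL_MARKERS = ("i can't help", "i cannot help", "i don't know")

def record_heuristic_scores(output: str) -> None:
    abv.score_current_span(
        name="non_empty",
        value=1.0 if output.strip() else 0.0,
        data_type="BOOLEAN"
    )
    abv.score_current_span(
        name="is_refusal",
        value=1.0 if any(m in output.lower() for m in REFUSAL_MARKERS) else 0.0,
        data_type="BOOLEAN",
        comment="Substring heuristic"
    )
```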
JavaScript
```typescript
import { startActiveObservation } from "@abvdev/tracing";

const abv = new ABVClient({ apiKey: "sk-abv-..." });

await startActiveObservation("generate-response", async (span) => {
  const response = await abv.gateway.chat.completions.create({
    provider: "openai",
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "What is AI?" }],
  });
  const output = response.choices[0].message.content;

  // Score this observation
  abv.score.activeObservation({
    name: "relevance",
    value: 0.95,
    comment: "Highly relevant response",
  });

  abv.score.activeObservation({
    name: "is_helpful",
    value: 1.0,
  });

  span.update({ output });
  return output;
});

await abv.score.flush();
```
Score a Trace

Apply scores to the entire trace (which may contain multiple observations).

Python
```python
@observe()
def full_pipeline(query: str) -> str:
    # Multiple operations happen here...
    result = process(query)

    # Score the entire trace
    abv.score_current_trace(
        name="user_satisfaction",
        value=4.5,
        data_type="NUMERIC",
        comment="User rated 4.5/5 stars"
    )

    abv.score_current_trace(
        name="task_completed",
        value=1.0,
        data_type="BOOLEAN"
    )

    return result
```
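
Trace-level scores often come from user feedback that arrives after the traced function has returned. In that case you can attach the score by trace ID with `create_score` (covered under Batch Scoring below). A minimal sketch, assuming your application keeps the trace ID alongside the conversation and receives a 1-5 star rating:

```python
# Minimal sketch: attach post-hoc user feedback to a finished trace by ID.
# Assumes your application stores the trace ID alongside the conversation.
def record_user_feedback(trace_id: str, stars: int) -> None:
    abv.create_score(
        trace_id=trace_id,
        name="user_satisfaction",
        value=float(stars),
        data_type="NUMERIC"
    )
```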
JavaScript
```typescript
await startActiveObservation("full-pipeline", async (span) => {
  const result = await process(query);

  // Score entire trace
  abv.score.activeTrace({
    name: "user_satisfaction",
    value: 4.5,
    comment: "User rated 4.5/5 stars",
  });

  abv.score.activeTrace({
    name: "task_completed",
    value: 1.0,
  });

  return result;
});
```
Create a Dataset

Create evaluation datasets to systematically test your LLM.

Python
```python
# Create dataset
dataset = abv.create_dataset(
    name="customer-support-eval",
    description="Test cases for customer support bot",
    metadata={"version": "1.0", "domain": "support"}
)

# Add test items
test_cases = [
    {
        "input": {"query": "How do I reset my password?"},
        "expected_output": "Go to Settings > Security > Reset Password"
    },
    {
        "input": {"query": "What are your business hours?"},
        "expected_output": "We're open Monday-Friday, 9 AM - 5 PM EST"
    },
    {
        "input": {"query": "I want a refund"},
        "expected_output": "I'll help you with the refund process..."
    }
]

for case in test_cases:
    abv.create_dataset_item(
        dataset_name="customer-support-eval",
        input=case["input"],
        expected_output=case["expected_output"],
        metadata={"category": "faq"}
    )
```
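
Hard-coded test cases work for a handful of items; larger suites are usually kept in a file. A minimal sketch that loads the same fields from a JSONL file (the file name and per-line schema are assumptions about your own data, not an SDK format):

```python
# Minimal sketch: load dataset items from a JSONL file instead of hard-coding them.
# The file name and keys ("input", "expected_output", "category") are assumptions.
import json

with open("eval_cases.jsonl") as f:
    for line in f:
        case = json.loads(line)
        abv.create_dataset_item(
            dataset_name="customer-support-eval",
            input=case["input"],
            expected_output=case["expected_output"],
            metadata={"category": case.get("category", "faq")}
        )
```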
JavaScript
```typescript
// Create dataset
await abv.api.datasets.create({
  name: "customer-support-eval",
  description: "Test cases for customer support bot",
  metadata: { version: "1.0", domain: "support" },
});

// Add test items
const testCases = [
  {
    input: { query: "How do I reset my password?" },
    expectedOutput: "Go to Settings > Security > Reset Password",
  },
  {
    input: { query: "What are your business hours?" },
    expectedOutput: "We're open Monday-Friday, 9 AM - 5 PM EST",
  },
  {
    input: { query: "I want a refund" },
    expectedOutput: "I'll help you with the refund process...",
  },
];

for (const testCase of testCases) {
  await abv.api.datasetItems.create({
    datasetName: "customer-support-eval",
    input: testCase.input,
    expectedOutput: testCase.expectedOutput,
    metadata: { category: "faq" },
  });
}
```
Run Dataset Evaluation

Iterate through dataset items and score the results.

Python
```python
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

def evaluate_similarity(expected: str, actual: str) -> float:
    """Simple overlap-based similarity."""
    expected_words = set(expected.lower().split())
    actual_words = set(actual.lower().split())
    overlap = len(expected_words & actual_words)
    return overlap / max(len(expected_words), 1)

@observe()
def run_evaluation():
    dataset = abv.get_dataset("customer-support-eval")
    results = []

    for item in dataset.items:
        with item.run(
            run_name="eval-v1",
            run_description="Baseline evaluation"
        ) as span:
            # Generate response
            response = abv.gateway.complete_chat(
                provider="openai",
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": item.input["query"]}]
            )
            actual_output = response.choices[0].message.content

            # Score against expected
            similarity = evaluate_similarity(
                item.expected_output,
                actual_output
            )
            span.score(
                name="similarity",
                value=similarity,
                data_type="NUMERIC"
            )
            span.score(
                name="matches_expected",
                value=1.0 if similarity > 0.7 else 0.0,
                data_type="BOOLEAN"
            )

            results.append({
                "input": item.input,
                "expected": item.expected_output,
                "actual": actual_output,
                "similarity": similarity
            })

    return results

results = run_evaluation()
avg_similarity = sum(r["similarity"] for r in results) / len(results)
print(f"Average similarity: {avg_similarity:.2%}")
```
JavaScript
```typescript
import { startActiveObservation } from "@abvdev/tracing";

function evaluateSimilarity(expected: string, actual: string): number {
  const expectedWords = new Set(expected.toLowerCase().split(/\s+/));
  const actualWords = new Set(actual.toLowerCase().split(/\s+/));
  const overlap = [...expectedWords].filter((w) => actualWords.has(w)).length;
  return overlap / Math.max(expectedWords.size, 1);
}

await startActiveObservation("run-evaluation", async () => {
  const dataset = await abv.dataset.get("customer-support-eval");
  const results: { similarity: number }[] = [];

  for (const item of dataset.items) {
    await startActiveObservation(`eval-item-${item.id}`, async (span) => {
      const response = await abv.gateway.chat.completions.create({
        provider: "openai",
        model: "gpt-4o-mini",
        messages: [{ role: "user", content: item.input.query }],
      });
      const actualOutput = response.choices[0].message.content;

      const similarity = evaluateSimilarity(
        item.expectedOutput,
        actualOutput
      );

      abv.score.activeObservation({
        name: "similarity",
        value: similarity,
      });
      abv.score.activeObservation({
        name: "matches_expected",
        value: similarity > 0.7 ? 1.0 : 0.0,
      });

      // Link to dataset item
      await item.link(
        { otelSpan: span },
        "eval-v1",
        { description: "Baseline evaluation" }
      );

      results.push({ similarity });
    });
  }

  const avgSimilarity =
    results.reduce((a, b) => a + b.similarity, 0) / results.length;
  console.log(`Average similarity: ${(avgSimilarity * 100).toFixed(1)}%`);
});

await abv.score.flush();
```
LLM-as-Judge Scoring

Use an LLM to evaluate output quality.

Python
```python
import json
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

JUDGE_PROMPT = """Evaluate the following response for quality.
User Query: {query}
Response: {response}
Score each criterion from 0-10:
1. Relevance: Does the response answer the query?
2. Accuracy: Is the information correct?
3. Clarity: Is the response clear and well-structured?
4. Helpfulness: Does it provide actionable information?
Respond in JSON format:
{{"relevance": N, "accuracy": N, "clarity": N, "helpfulness": N, "reasoning": "..."}}
"""

@observe(name="llm-judge", as_type="evaluator")
def judge_response(query: str, response: str) -> dict:
    judge_result = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": JUDGE_PROMPT.format(query=query, response=response)
        }],
        temperature=0  # Deterministic evaluation
    )
    scores = json.loads(judge_result.choices[0].message.content)

    # Record individual scores
    for criterion in ["relevance", "accuracy", "clarity", "helpfulness"]:
        abv.score_current_span(
            name=criterion,
            value=scores[criterion] / 10,  # Normalize to 0-1
            data_type="NUMERIC"
        )

    # Overall score (also returned in the dict, used by the A/B test example below)
    overall = sum(scores[c] for c in ["relevance", "accuracy", "clarity", "helpfulness"]) / 4
    scores["overall_quality"] = overall / 10
    abv.score_current_span(
        name="overall_quality",
        value=scores["overall_quality"],
        data_type="NUMERIC",
        comment=scores.get("reasoning", "")
    )
    return scores

# Use in evaluation pipeline
@observe()
def evaluate_with_judge(query: str) -> dict:
    # Generate response
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Judge the response
    scores = judge_response(query, output)
    return {"response": output, "scores": scores}
```
JavaScript
```typescript
import { startActiveObservation } from "@abvdev/tracing";

const JUDGE_PROMPT = `Evaluate the following response for quality.
User Query: {query}
Response: {response}
Score each criterion from 0-10:
1. Relevance: Does the response answer the query?
2. Accuracy: Is the information correct?
3. Clarity: Is the response clear and well-structured?
4. Helpfulness: Does it provide actionable information?
Respond in JSON format:
{"relevance": N, "accuracy": N, "clarity": N, "helpfulness": N, "reasoning": "..."}`;

async function judgeResponse(query: string, response: string) {
  return startActiveObservation(
    "llm-judge",
    async () => {
      const judgeResult = await abv.gateway.chat.completions.create({
        provider: "openai",
        model: "gpt-4o",
        messages: [
          {
            role: "user",
            content: JUDGE_PROMPT.replace("{query}", query).replace(
              "{response}",
              response
            ),
          },
        ],
        temperature: 0,
      });
      const scores = JSON.parse(judgeResult.choices[0].message.content);

      // Record scores
      for (const criterion of [
        "relevance",
        "accuracy",
        "clarity",
        "helpfulness",
      ]) {
        abv.score.activeObservation({
          name: criterion,
          value: scores[criterion] / 10,
        });
      }

      const overall =
        (scores.relevance +
          scores.accuracy +
          scores.clarity +
          scores.helpfulness) /
        4;
      // Also return the normalized overall score (used by the A/B test example below)
      scores.overall_quality = overall / 10;
      abv.score.activeObservation({
        name: "overall_quality",
        value: scores.overall_quality,
        comment: scores.reasoning,
      });
      return scores;
    },
    { asType: "evaluator" }
  );
}

// Usage
await startActiveObservation("evaluate-with-judge", async () => {
  const response = await abv.gateway.chat.completions.create({
    provider: "openai",
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "What is machine learning?" }],
  });
  const output = response.choices[0].message.content;

  const scores = await judgeResponse("What is machine learning?", output);
  return { response: output, scores };
});
```
Batch Scoring

Score many traces or observations efficiently.

Python
```python
# Create scores in batch (automatically batched and flushed)
scores_to_create = [
    {"trace_id": "trace-1", "name": "quality", "value": 0.9},
    {"trace_id": "trace-2", "name": "quality", "value": 0.85},
    {"trace_id": "trace-3", "name": "quality", "value": 0.95},
]

for score in scores_to_create:
    abv.create_score(
        trace_id=score["trace_id"],
        name=score["name"],
        value=score["value"],
        data_type="NUMERIC"
    )

# Force flush if needed
abv.flush()
```
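
Batch scoring is handy for backfilling results from an offline review. The sketch below reads trace IDs and quality labels from a CSV export and replays them through the same `create_score` call; the file name and column names are assumptions about your export format, not SDK requirements.

```python
# Minimal sketch: backfill scores from an offline review exported as CSV.
# The file name and columns ("trace_id", "quality") are assumptions about your export.
import csv

with open("review_results.csv", newline="") as f:
    for row in csv.DictReader(f):
        abv.create_score(
            trace_id=row["trace_id"],
            name="quality",
            value=float(row["quality"]),
            data_type="NUMERIC"
        )

abv.flush()  # Ensure everything is sent before the script exits
```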
JavaScript
```typescript
const scoresToCreate = [
  { traceId: "trace-1", name: "quality", value: 0.9 },
  { traceId: "trace-2", name: "quality", value: 0.85 },
  { traceId: "trace-3", name: "quality", value: 0.95 },
];

for (const score of scoresToCreate) {
  abv.score.create({
    traceId: score.traceId,
    name: score.name,
    value: score.value,
  });
}

// Scores are batched automatically
// Force flush if needed
await abv.score.flush();
```
A/B Test Prompts

Compare different prompts using dataset evaluation.

Python
```python
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

PROMPTS = {
    "concise": "Answer briefly: {query}",
    "detailed": "Provide a comprehensive answer with examples: {query}",
    "step_by_step": "Answer step by step: {query}"
}

@observe()
def ab_test_prompts(dataset_name: str):
    dataset = abv.get_dataset(dataset_name)
    results = {name: [] for name in PROMPTS}

    for item in dataset.items:
        for prompt_name, prompt_template in PROMPTS.items():
            with abv.start_as_current_span(name=f"test-{prompt_name}") as span:
                prompt = prompt_template.format(query=item.input["query"])
                response = abv.gateway.complete_chat(
                    provider="openai",
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}]
                )

                # Score the response with the judge defined in the
                # LLM-as-Judge section above
                scores = judge_response(item.input["query"], response.choices[0].message.content)
                results[prompt_name].append(scores["overall_quality"])

                span.update(metadata={"prompt_variant": prompt_name})

    # Print comparison
    for name, scores in results.items():
        avg = sum(scores) / len(scores)
        print(f"{name}: {avg:.2f} average quality")

ab_test_prompts("customer-support-eval")
```
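
Average quality alone can be misleading on small datasets; looking at the spread per variant helps you judge whether a gap between prompts is real or noise. A minimal sketch using only the standard library, which assumes you return the `results` dict from `ab_test_prompts` instead of only printing it:

```python
# Minimal sketch: report spread as well as mean for each prompt variant.
# Assumes `results` maps variant name -> list of overall_quality scores,
# e.g. returned from ab_test_prompts.
from statistics import mean, stdev

def summarize_variants(results: dict[str, list[float]]) -> None:
    for name, scores in results.items():
        spread = stdev(scores) if len(scores) > 1 else 0.0
        print(f"{name}: mean={mean(scores):.2f}, stdev={spread:.2f}, n={len(scores)}")
```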
JavaScript
```typescript
const PROMPTS = {
  concise: "Answer briefly: {query}",
  detailed: "Provide a comprehensive answer with examples: {query}",
  step_by_step: "Answer step by step: {query}",
};

await startActiveObservation("ab-test-prompts", async () => {
  const dataset = await abv.dataset.get("customer-support-eval");
  const results: Record<string, number[]> = {};
  for (const name of Object.keys(PROMPTS)) {
    results[name] = [];
  }

  for (const item of dataset.items) {
    for (const [promptName, promptTemplate] of Object.entries(PROMPTS)) {
      await startActiveObservation(`test-${promptName}`, async (span) => {
        const prompt = promptTemplate.replace("{query}", item.input.query);
        const response = await abv.gateway.chat.completions.create({
          provider: "openai",
          model: "gpt-4o-mini",
          messages: [{ role: "user", content: prompt }],
        });

        // Score the response with the judge defined in the LLM-as-Judge section above
        const scores = await judgeResponse(
          item.input.query,
          response.choices[0].message.content
        );
        results[promptName].push(scores.overall_quality);

        span.update({ metadata: { prompt_variant: promptName } });
      });
    }
  }

  // Print comparison
  for (const [name, scores] of Object.entries(results)) {
    const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
    console.log(`${name}: ${avg.toFixed(2)} average quality`);
  }
});
```