Evaluations help you measure and improve LLM output quality through datasets, automated scoring, and experiments.

Score an Observation

Add quality scores to any traced observation.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

@observe()
def generate_response(query: str) -> str:
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Score this observation
    abv.score_current_span(
        name="relevance",
        value=0.95,
        data_type="NUMERIC",
        comment="Highly relevant response"
    )

    # Boolean score
    abv.score_current_span(
        name="is_helpful",
        value=1.0,
        data_type="BOOLEAN"
    )

    # Categorical score
    abv.score_current_span(
        name="tone",
        value="professional",
        data_type="CATEGORICAL"
    )

    return output
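
Calling the decorated function creates the observation, and the scores above attach to its span automatically. A minimal usage sketch (the example query is illustrative):
answer = generate_response("How do I reset my password?")
print(answer)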

Score a Trace

Apply scores to the trace as a whole, covering all of its observations.
@observe()
def full_pipeline(query: str) -> str:
    # Multiple operations happen here; process() is a placeholder for your own pipeline logic
    result = process(query)

    # Score the entire trace
    abv.score_current_trace(
        name="user_satisfaction",
        value=4.5,
        data_type="NUMERIC",
        comment="User rated 4.5/5 stars"
    )

    abv.score_current_trace(
        name="task_completed",
        value=1.0,
        data_type="BOOLEAN"
    )

    return result

Create a Dataset

Create evaluation datasets to systematically test your LLM.
# Create dataset
dataset = abv.create_dataset(
    name="customer-support-eval",
    description="Test cases for customer support bot",
    metadata={"version": "1.0", "domain": "support"}
)

# Add test items
test_cases = [
    {
        "input": {"query": "How do I reset my password?"},
        "expected_output": "Go to Settings > Security > Reset Password"
    },
    {
        "input": {"query": "What are your business hours?"},
        "expected_output": "We're open Monday-Friday, 9 AM - 5 PM EST"
    },
    {
        "input": {"query": "I want a refund"},
        "expected_output": "I'll help you with the refund process..."
    }
]

for case in test_cases:
    abv.create_dataset_item(
        dataset_name="customer-support-eval",
        input=case["input"],
        expected_output=case["expected_output"],
        metadata={"category": "faq"}
    )
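
To sanity-check what was stored, read the dataset back and list its items (abv.get_dataset is also used in the next section):
# Verify the dataset contents; item fields mirror the create_dataset_item calls above
dataset = abv.get_dataset("customer-support-eval")
for item in dataset.items:
    print(item.input["query"], "->", item.expected_output)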

Run Dataset Evaluation

Iterate through dataset items and score results.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

def evaluate_similarity(expected: str, actual: str) -> float:
    """Simple overlap-based similarity."""
    expected_words = set(expected.lower().split())
    actual_words = set(actual.lower().split())
    overlap = len(expected_words & actual_words)
    return overlap / max(len(expected_words), 1)

@observe()
def run_evaluation():
    dataset = abv.get_dataset("customer-support-eval")

    results = []
    for item in dataset.items:
        with item.run(
            run_name="eval-v1",
            run_description="Baseline evaluation"
        ) as span:
            # Generate response
            response = abv.gateway.complete_chat(
                provider="openai",
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": item.input["query"]}]
            )
            actual_output = response.choices[0].message.content

            # Score against expected
            similarity = evaluate_similarity(
                item.expected_output,
                actual_output
            )

            span.score(
                name="similarity",
                value=similarity,
                data_type="NUMERIC"
            )

            span.score(
                name="matches_expected",
                value=1.0 if similarity > 0.7 else 0.0,
                data_type="BOOLEAN"
            )

            results.append({
                "input": item.input,
                "expected": item.expected_output,
                "actual": actual_output,
                "similarity": similarity
            })

    return results

results = run_evaluation()
avg_similarity = sum(r["similarity"] for r in results) / len(results)
print(f"Average similarity: {avg_similarity:.2%}")
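
Beyond the average, a pass rate against the 0.7 threshold used above is often easier to track across runs:
# Share of items whose similarity cleared the 0.7 threshold
pass_rate = sum(1 for r in results if r["similarity"] > 0.7) / len(results)
print(f"Pass rate: {pass_rate:.0%} ({len(results)} items)")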

LLM-as-Judge Scoring

Use an LLM to evaluate output quality.
import json
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

JUDGE_PROMPT = """Evaluate the following response for quality.

User Query: {query}
Response: {response}

Score each criterion from 0-10:
1. Relevance: Does the response answer the query?
2. Accuracy: Is the information correct?
3. Clarity: Is the response clear and well-structured?
4. Helpfulness: Does it provide actionable information?

Respond in JSON format:
{{"relevance": N, "accuracy": N, "clarity": N, "helpfulness": N, "reasoning": "..."}}
"""

@observe(name="llm-judge", as_type="evaluator")
def judge_response(query: str, response: str) -> dict:
    judge_result = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": JUDGE_PROMPT.format(query=query, response=response)
        }],
        temperature=0  # Keep the judge as consistent as possible
    )

    scores = json.loads(judge_result.choices[0].message.content)

    # Record individual scores
    for criterion in ["relevance", "accuracy", "clarity", "helpfulness"]:
        abv.score_current_span(
            name=criterion,
            value=scores[criterion] / 10,  # Normalize to 0-1
            data_type="NUMERIC"
        )

    # Overall score: mean of the four criteria, normalized to 0-1 and kept in the
    # returned dict so callers (e.g. the A/B test below) can read it directly
    overall = sum(scores[c] for c in ["relevance", "accuracy", "clarity", "helpfulness"]) / 4
    scores["overall_quality"] = overall / 10
    abv.score_current_span(
        name="overall_quality",
        value=scores["overall_quality"],
        data_type="NUMERIC",
        comment=scores.get("reasoning", "")
    )

    return scores

# Use in evaluation pipeline
@observe()
def evaluate_with_judge(query: str) -> dict:
    # Generate response
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Judge the response
    scores = judge_response(query, output)

    return {"response": output, "scores": scores}
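
A minimal usage sketch; the query is illustrative and the keys printed match what judge_response returns:
# Judge a single query end to end
result = evaluate_with_judge("How do I reset my password?")
print(result["scores"].get("reasoning", ""))
print(f"Overall quality: {result['scores']['overall_quality']:.2f}")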

Batch Scoring

Efficiently score multiple traces/observations.
# Create many scores; the SDK queues them and flushes them in batches automatically
scores_to_create = [
    {"trace_id": "trace-1", "name": "quality", "value": 0.9},
    {"trace_id": "trace-2", "name": "quality", "value": 0.85},
    {"trace_id": "trace-3", "name": "quality", "value": 0.95},
]

for score in scores_to_create:
    abv.create_score(
        trace_id=score["trace_id"],
        name=score["name"],
        value=score["value"],
        data_type="NUMERIC"
    )

# Force flush if needed
abv.flush()
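
In practice the scores usually come from an offline source rather than inline literals. Here is a sketch that loads human ratings from a JSON file and reuses the same create_score call; the file name and its trace_id/rating fields are hypothetical:
import json

# Hypothetical ratings file: [{"trace_id": "...", "rating": 0.9}, ...]
with open("human_ratings.json") as f:
    ratings = json.load(f)

for r in ratings:
    abv.create_score(
        trace_id=r["trace_id"],
        name="human_rating",
        value=r["rating"],
        data_type="NUMERIC"
    )

abv.flush()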

A/B Test Prompts

Compare different prompts using dataset evaluation.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

PROMPTS = {
    "concise": "Answer briefly: {query}",
    "detailed": "Provide a comprehensive answer with examples: {query}",
    "step_by_step": "Answer step by step: {query}"
}

@observe()
def ab_test_prompts(dataset_name: str):
    dataset = abv.get_dataset(dataset_name)
    results = {name: [] for name in PROMPTS}

    for item in dataset.items:
        for prompt_name, prompt_template in PROMPTS.items():
            with abv.start_as_current_span(name=f"test-{prompt_name}") as span:
                prompt = prompt_template.format(query=item.input["query"])

                response = abv.gateway.complete_chat(
                    provider="openai",
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}]
                )

                # Score response
                scores = judge_response(item.input["query"], response.choices[0].message.content)
                results[prompt_name].append(scores["overall_quality"])

                span.update(metadata={"prompt_variant": prompt_name})

    # Print comparison and return the per-variant averages for further analysis
    averages = {}
    for name, scores in results.items():
        averages[name] = sum(scores) / len(scores)
        print(f"{name}: {averages[name]:.2f} average quality")
    return averages

averages = ab_test_prompts("customer-support-eval")
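
With the averages returned, picking the winning variant is a one-liner (tie-breaking and statistical significance are out of scope here):
# Select the prompt variant with the highest average judge score
best_variant = max(averages, key=averages.get)
print(f"Best prompt variant: {best_variant} ({averages[best_variant]:.2f} average quality)")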