Evaluations help you measure and improve LLM output quality through datasets, automated scoring, and experiments.

Score an Observation

Add quality scores to any traced observation.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

@observe()
def generate_response(query: str) -> str:
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Score this observation
    abv.score_current_span(
        name="relevance",
        value=0.95,
        data_type="NUMERIC",
        comment="Highly relevant response"
    )

    # Boolean score
    abv.score_current_span(
        name="is_helpful",
        value=1.0,
        data_type="BOOLEAN"
    )

    # Categorical score
    abv.score_current_span(
        name="tone",
        value="professional",
        data_type="CATEGORICAL"
    )

    return output
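
Calling the decorated function creates the observation, and the scores above attach to its span automatically. A minimal usage sketch (the example query is illustrative):
answer = generate_response("How do I reset my password?")
print(answer)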

Score a Trace

Apply scores to the trace as a whole, covering all of its observations.
@observe()
def full_pipeline(query: str) -> str:
    # Multiple operations happen here; process() is a placeholder for your own pipeline logic
    result = process(query)

    # Score the entire trace
    abv.score_current_trace(
        name="user_satisfaction",
        value=4.5,
        data_type="NUMERIC",
        comment="User rated 4.5/5 stars"
    )

    abv.score_current_trace(
        name="task_completed",
        value=1.0,
        data_type="BOOLEAN"
    )

    return result

Create a Dataset

Create evaluation datasets to systematically test your LLM.
# Create dataset
dataset = abv.create_dataset(
    name="customer-support-eval",
    description="Test cases for customer support bot",
    metadata={"version": "1.0", "domain": "support"}
)

# Add test items
test_cases = [
    {
        "input": {"query": "How do I reset my password?"},
        "expected_output": "Go to Settings > Security > Reset Password"
    },
    {
        "input": {"query": "What are your business hours?"},
        "expected_output": "We're open Monday-Friday, 9 AM - 5 PM EST"
    },
    {
        "input": {"query": "I want a refund"},
        "expected_output": "I'll help you with the refund process..."
    }
]

for case in test_cases:
    abv.create_dataset_item(
        dataset_name="customer-support-eval",
        input=case["input"],
        expected_output=case["expected_output"],
        metadata={"category": "faq"}
    )
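
To sanity-check what was stored, read the dataset back and list its items (abv.get_dataset is also used in the next section):
# Verify the dataset contents; item fields mirror the create_dataset_item calls above
dataset = abv.get_dataset("customer-support-eval")
for item in dataset.items:
    print(item.input["query"], "->", item.expected_output)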

Run Dataset Evaluation

Iterate through dataset items and score results.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

def evaluate_similarity(expected: str, actual: str) -> float:
    """Simple overlap-based similarity."""
    expected_words = set(expected.lower().split())
    actual_words = set(actual.lower().split())
    overlap = len(expected_words & actual_words)
    return overlap / max(len(expected_words), 1)

@observe()
def run_evaluation():
    dataset = abv.get_dataset("customer-support-eval")

    results = []
    for item in dataset.items:
        with item.run(
            run_name="eval-v1",
            run_description="Baseline evaluation"
        ) as span:
            # Generate response
            response = abv.gateway.complete_chat(
                provider="openai",
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": item.input["query"]}]
            )
            actual_output = response.choices[0].message.content

            # Score against expected
            similarity = evaluate_similarity(
                item.expected_output,
                actual_output
            )

            span.score(
                name="similarity",
                value=similarity,
                data_type="NUMERIC"
            )

            span.score(
                name="matches_expected",
                value=1.0 if similarity > 0.7 else 0.0,
                data_type="BOOLEAN"
            )

            results.append({
                "input": item.input,
                "expected": item.expected_output,
                "actual": actual_output,
                "similarity": similarity
            })

    return results

results = run_evaluation()
avg_similarity = sum(r["similarity"] for r in results) / len(results)
print(f"Average similarity: {avg_similarity:.2%}")
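
Beyond the average, a pass rate against the 0.7 threshold used above is often easier to track across runs:
# Share of items whose similarity cleared the 0.7 threshold
pass_rate = sum(1 for r in results if r["similarity"] > 0.7) / len(results)
print(f"Pass rate: {pass_rate:.0%} ({len(results)} items)")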

LLM-as-Judge Scoring

Use an LLM to evaluate output quality.
import json
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

JUDGE_PROMPT = """Evaluate the following response for quality.

User Query: {query}
Response: {response}

Score each criterion from 0-10:
1. Relevance: Does the response answer the query?
2. Accuracy: Is the information correct?
3. Clarity: Is the response clear and well-structured?
4. Helpfulness: Does it provide actionable information?

Respond in JSON format:
{{"relevance": N, "accuracy": N, "clarity": N, "helpfulness": N, "reasoning": "..."}}
"""

@observe(name="llm-judge", as_type="evaluator")
def judge_response(query: str, response: str) -> dict:
    judge_result = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": JUDGE_PROMPT.format(query=query, response=response)
        }],
        temperature=0  # Keep the judge as consistent as possible
    )

    scores = json.loads(judge_result.choices[0].message.content)

    # Record individual scores
    for criterion in ["relevance", "accuracy", "clarity", "helpfulness"]:
        abv.score_current_span(
            name=criterion,
            value=scores[criterion] / 10,  # Normalize to 0-1
            data_type="NUMERIC"
        )

    # Overall score: mean of the four criteria, normalized to 0-1 and kept in the
    # returned dict so callers (e.g. the A/B test below) can read it directly
    overall = sum(scores[c] for c in ["relevance", "accuracy", "clarity", "helpfulness"]) / 4
    scores["overall_quality"] = overall / 10
    abv.score_current_span(
        name="overall_quality",
        value=scores["overall_quality"],
        data_type="NUMERIC",
        comment=scores.get("reasoning", "")
    )

    return scores

# Use in evaluation pipeline
@observe()
def evaluate_with_judge(query: str) -> dict:
    # Generate response
    response = abv.gateway.complete_chat(
        provider="openai",
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": query}]
    )
    output = response.choices[0].message.content

    # Judge the response
    scores = judge_response(query, output)

    return {"response": output, "scores": scores}
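
A minimal usage sketch; the query is illustrative and the keys printed match what judge_response returns:
# Judge a single query end to end
result = evaluate_with_judge("How do I reset my password?")
print(result["scores"].get("reasoning", ""))
print(f"Overall quality: {result['scores']['overall_quality']:.2f}")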

Batch Scoring

Efficiently score multiple traces/observations.
# Create many scores; the SDK queues them and flushes them in batches automatically
scores_to_create = [
    {"trace_id": "trace-1", "name": "quality", "value": 0.9},
    {"trace_id": "trace-2", "name": "quality", "value": 0.85},
    {"trace_id": "trace-3", "name": "quality", "value": 0.95},
]

for score in scores_to_create:
    abv.create_score(
        trace_id=score["trace_id"],
        name=score["name"],
        value=score["value"],
        data_type="NUMERIC"
    )

# Force flush if needed
abv.flush()
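
In practice the scores usually come from an offline source rather than inline literals. Here is a sketch that loads human ratings from a JSON file and reuses the same create_score call; the file name and its trace_id/rating fields are hypothetical:
import json

# Hypothetical ratings file: [{"trace_id": "...", "rating": 0.9}, ...]
with open("human_ratings.json") as f:
    ratings = json.load(f)

for r in ratings:
    abv.create_score(
        trace_id=r["trace_id"],
        name="human_rating",
        value=r["rating"],
        data_type="NUMERIC"
    )

abv.flush()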

A/B Test Prompts

Compare different prompts using dataset evaluation.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

PROMPTS = {
    "concise": "Answer briefly: {query}",
    "detailed": "Provide a comprehensive answer with examples: {query}",
    "step_by_step": "Answer step by step: {query}"
}

@observe()
def ab_test_prompts(dataset_name: str):
    dataset = abv.get_dataset(dataset_name)
    results = {name: [] for name in PROMPTS}

    for item in dataset.items:
        for prompt_name, prompt_template in PROMPTS.items():
            with abv.start_as_current_span(name=f"test-{prompt_name}") as span:
                prompt = prompt_template.format(query=item.input["query"])

                response = abv.gateway.complete_chat(
                    provider="openai",
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}]
                )

                # Score response
                scores = judge_response(item.input["query"], response.choices[0].message.content)
                results[prompt_name].append(scores["overall_quality"])

                span.update(metadata={"prompt_variant": prompt_name})

    # Print comparison and return the per-variant averages for further analysis
    averages = {}
    for name, scores in results.items():
        averages[name] = sum(scores) / len(scores)
        print(f"{name}: {averages[name]:.2f} average quality")
    return averages

averages = ab_test_prompts("customer-support-eval")
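
With the averages returned, picking the winning variant is a one-liner (tie-breaking and statistical significance are out of scope here):
# Select the prompt variant with the highest average judge score
best_variant = max(averages, key=averages.get)
print(f"Best prompt variant: {best_variant} ({averages[best_variant]:.2f} average quality)")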