These patterns show how to combine multiple ABV features for production-grade AI applications.

Validated Chat Pipeline

Combines tracing, input/output guardrails, and scoring.
from abvdev import ABV, observe
from dataclasses import dataclass
from typing import Optional

abv = ABV(api_key="sk-abv-...")

@dataclass
class ChatResult:
    response: Optional[str]
    blocked: bool
    reason: Optional[str]

@observe()
def validated_chat(user_id: str, session_id: str, message: str) -> ChatResult:
    # Set trace context
    abv.update_current_trace(
        user_id=user_id,
        session_id=session_id,
        tags=["chat", "production"]
    )

    # Input guardrail
    with abv.start_as_current_observation(as_type="guardrail", name="input-check") as guard:
        input_check = abv.guardrails.validate_toxic_language(message, {"sensitivity": "medium"})
        guard.update(output={"status": input_check.status})

        if input_check.status == "FAIL":
            abv.score_current_trace(name="blocked", value=1.0, data_type="BOOLEAN")
            return ChatResult(response=None, blocked=True, reason="Input blocked")

    # Generate response
    with abv.start_as_current_generation(name="llm-call", model="gpt-4o-mini") as gen:
        response = abv.gateway.complete_chat(
            provider="openai",
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message}
            ]
        )
        output = response.choices[0].message.content
        gen.update(
            output=output,
            usage_details={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )

    # Output guardrail
    with abv.start_as_current_observation(as_type="guardrail", name="output-check") as guard:
        output_check = abv.guardrails.validate_toxic_language(output, {"sensitivity": "high"})
        guard.update(output={"status": output_check.status})

        if output_check.status == "FAIL":
            abv.score_current_trace(name="output_filtered", value=1.0, data_type="BOOLEAN")
            return ChatResult(response=None, blocked=True, reason="Output filtered")

    # Score successful interaction
    abv.score_current_trace(name="completed", value=1.0, data_type="BOOLEAN")
    abv.score_current_trace(name="response_length", value=len(output), data_type="NUMERIC")

    return ChatResult(response=output, blocked=False, reason=None)

result = validated_chat("user-123", "session-456", "What is machine learning?")
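
# Handle the result (illustrative)
if result.blocked:
    print(f"Blocked: {result.reason}")
else:
    print(result.response)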

Cost-Tracked Generation with Fallback

Combines the gateway, cost tracking, and automatic provider fallback.
from abvdev import ABV, observe
from typing import List, Dict

abv = ABV(api_key="sk-abv-...")

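# Per-1K-token prices (illustrative; keep them in sync with current provider pricing)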
PROVIDERS = [
    {"provider": "openai", "model": "gpt-4o-mini", "cost_per_1k_input": 0.00015, "cost_per_1k_output": 0.0006},
    {"provider": "anthropic", "model": "claude-sonnet-4-20250514", "cost_per_1k_input": 0.003, "cost_per_1k_output": 0.015},
]

@observe()
def cost_aware_generation(messages: List[Dict], max_cost: float = 0.10) -> dict:
    abv.update_current_trace(metadata={"max_cost_budget": max_cost})

    for config in PROVIDERS:
        with abv.start_as_current_generation(
            name=f"{config['provider']}-call",
            model=config["model"]
        ) as gen:
            try:
                response = abv.gateway.complete_chat(
                    provider=config["provider"],
                    model=config["model"],
                    messages=messages
                )

                # Calculate cost
                input_tokens = response.usage.prompt_tokens
                output_tokens = response.usage.completion_tokens
                cost = (
                    (input_tokens / 1000) * config["cost_per_1k_input"] +
                    (output_tokens / 1000) * config["cost_per_1k_output"]
                )

                gen.update(
                    output=response.choices[0].message.content,
                    usage_details={
                        "prompt_tokens": input_tokens,
                        "completion_tokens": output_tokens
                    },
                    cost_details={"total_cost": cost, "currency": "USD"}
                )

                # Score cost metrics
                abv.score_current_trace(name="cost_usd", value=cost, data_type="NUMERIC")
                abv.score_current_trace(
                    name="under_budget",
                    value=1.0 if cost <= max_cost else 0.0,
                    data_type="BOOLEAN"
                )

                return {
                    "response": response.choices[0].message.content,
                    "provider": config["provider"],
                    "cost": cost
                }

            except Exception as e:
                gen.update(level="WARNING", status_message=str(e))
                continue

    raise Exception("All providers failed")
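
# Example call (message content and budget are illustrative)
result = cost_aware_generation(
    [{"role": "user", "content": "Summarize the main idea of transfer learning."}],
    max_cost=0.05
)
print(f"{result['provider']} responded for ${result['cost']:.5f}")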

Prompt + Guardrail + Evaluation

Fetches a managed prompt, validates the output, and auto-scores it with an LLM judge.
from abvdev import ABV, observe
import json

abv = ABV(api_key="sk-abv-...")

@observe()
def managed_prompt_pipeline(query: str, user_id: str) -> dict:
    abv.update_current_trace(user_id=user_id, tags=["managed-prompt"])

    # Fetch managed prompt
    prompt = abv.get_prompt("qa-assistant-v2")

    # Compile prompt with variables
    messages = prompt.compile(query=query, context="General knowledge")

    # Generate with prompt config
    with abv.start_as_current_generation(
        name="managed-generation",
        model=prompt.config.get("model", "gpt-4o-mini"),
        metadata={"prompt_name": prompt.name, "prompt_version": prompt.version}
    ) as gen:
        response = abv.gateway.complete_chat(
            provider="openai",
            model=prompt.config.get("model", "gpt-4o-mini"),
            messages=messages,
            temperature=prompt.config.get("temperature", 0.7)
        )
        output = response.choices[0].message.content
        gen.update(output=output)

    # Validate output structure if JSON expected
    if prompt.config.get("expects_json"):
        validation = abv.guardrails.validate_json(output, {"strictMode": True})
        abv.score_current_span(
            name="valid_json",
            value=1.0 if validation.status == "PASS" else 0.0,
            data_type="BOOLEAN"
        )

    # Auto-evaluate with LLM judge
    with abv.start_as_current_observation(as_type="evaluator", name="auto-judge") as judge:
        judge_response = abv.gateway.complete_chat(
            provider="openai",
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": (
                    f"Rate this response 1-10 for helpfulness.\n"
                    f"Query: {query}\n"
                    f"Response: {output}\n"
                    'Return JSON: {"score": N, "reason": "..."}'
                )
            }],
            temperature=0
        )
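        # Parse the judge's verdict (assumes well-formed JSON; add error handling in production)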
        scores = json.loads(judge_response.choices[0].message.content)
        judge.update(output=scores)

        abv.score_current_trace(
            name="helpfulness",
            value=scores["score"] / 10,
            data_type="NUMERIC",
            comment=scores["reason"]
        )

    return {"response": output, "helpfulness": scores["score"]}
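
# Example call (the "qa-assistant-v2" prompt must already exist; query and user id are illustrative)
result = managed_prompt_pipeline("What causes ocean tides?", user_id="user-123")
print(result["helpfulness"])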

Session-Based Conversation with Memory

Tracks multi-turn conversations with session context.
from abvdev import ABV, observe
from typing import List, Dict

abv = ABV(api_key="sk-abv-...")

# In-memory store (use Redis/DB in production; see the Redis sketch after this example)
conversation_store: Dict[str, List[Dict]] = {}

@observe()
def chat_with_memory(user_id: str, session_id: str, message: str) -> str:
    abv.update_current_trace(
        user_id=user_id,
        session_id=session_id,
        metadata={"conversation_turn": len(conversation_store.get(session_id, [])) + 1}
    )

    # Get or create conversation history
    if session_id not in conversation_store:
        conversation_store[session_id] = [
            {"role": "system", "content": "You are a helpful assistant. Be concise."}
        ]

    history = conversation_store[session_id]

    # Add user message
    history.append({"role": "user", "content": message})

    # Generate response
    with abv.start_as_current_generation(name="chat-turn", model="gpt-4o-mini") as gen:
        response = abv.gateway.complete_chat(
            provider="openai",
            model="gpt-4o-mini",
            messages=history
        )
        assistant_message = response.choices[0].message.content

        gen.update(
            input=history,
            output=assistant_message,
            metadata={"history_length": len(history)}
        )

    # Store assistant response
    history.append({"role": "assistant", "content": assistant_message})

    # Score conversation metrics
    abv.score_current_trace(
        name="conversation_length",
        value=len(history),
        data_type="NUMERIC"
    )

    return assistant_message

# Multi-turn conversation
chat_with_memory("user-1", "session-abc", "Hi, I'm learning Python")
chat_with_memory("user-1", "session-abc", "What's a good first project?")
chat_with_memory("user-1", "session-abc", "Can you give me more details?")

Quality Gate Pipeline

Blocks deployment when evaluation scores fall below threshold.
from abvdev import ABV, observe

abv = ABV(api_key="sk-abv-...")

@observe()
def quality_gate_evaluation(
    dataset_name: str,
    min_accuracy: float = 0.8,
    min_helpfulness: float = 0.7
) -> dict:
    """Run evaluation and determine if quality gate passes."""

    dataset = abv.get_dataset(dataset_name)
    scores = {"accuracy": [], "helpfulness": []}

    for item in dataset.items:
        with item.run(run_name="quality-gate", run_description="Pre-deploy check") as span:
            # Generate response
            response = abv.gateway.complete_chat(
                provider="openai",
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": item.input["query"]}]
            )
            output = response.choices[0].message.content

            # Check accuracy via simple substring match (swap in semantic similarity for paraphrase tolerance)
            accuracy = 1.0 if item.expected_output.lower() in output.lower() else 0.0
            scores["accuracy"].append(accuracy)
            span.score(name="accuracy", value=accuracy, data_type="NUMERIC")

            # LLM judge for helpfulness
            judge_resp = abv.gateway.complete_chat(
                provider="openai",
                model="gpt-4o",
                messages=[{
                    "role": "user",
                    "content": f"Rate 0-1: Is this helpful? Query: {item.input['query']} Response: {output}. Reply with just a number."
                }],
                temperature=0
            )
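            # Parse the numeric score (assumes the judge replies with a bare number)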
            helpfulness = float(judge_resp.choices[0].message.content.strip())
            scores["helpfulness"].append(helpfulness)
            span.score(name="helpfulness", value=helpfulness, data_type="NUMERIC")

    # Calculate averages
    avg_accuracy = sum(scores["accuracy"]) / len(scores["accuracy"])
    avg_helpfulness = sum(scores["helpfulness"]) / len(scores["helpfulness"])

    # Determine gate status
    gate_passed = avg_accuracy >= min_accuracy and avg_helpfulness >= min_helpfulness

    # Score the overall run
    abv.score_current_trace(name="avg_accuracy", value=avg_accuracy, data_type="NUMERIC")
    abv.score_current_trace(name="avg_helpfulness", value=avg_helpfulness, data_type="NUMERIC")
    abv.score_current_trace(name="gate_passed", value=1.0 if gate_passed else 0.0, data_type="BOOLEAN")

    return {
        "gate_passed": gate_passed,
        "avg_accuracy": avg_accuracy,
        "avg_helpfulness": avg_helpfulness,
        "thresholds": {"accuracy": min_accuracy, "helpfulness": min_helpfulness}
    }

# Run in CI/CD
result = quality_gate_evaluation("production-test-cases", min_accuracy=0.85)
if not result["gate_passed"]:
    raise Exception(f"Quality gate failed: {result}")