# Getting started with Evaluation


Opik provides two approaches to evaluation. Choose the one that fits your use case:

  • Test Suites: Define assertions in natural language and let an LLM judge test them. Best for pass/fail behavioral testing.
  • Datasets & Metrics: Score outputs against a dataset using quantitative metrics. Best for measuring quality across many traces.

## Quick start

<Tabs>
<Tab title="Test Suites">

Test Suites let you define expected behaviors as natural-language assertions and run them against your agent. An LLM judge checks each assertion automatically.
<CodeBlocks>
  ```python title="Python"
  import opik
  from openai import OpenAI
  from opik.integrations.openai import track_openai

  openai_client = track_openai(OpenAI())
  opik_client = opik.Opik()

  # Create a suite with assertions
  suite = opik_client.get_or_create_test_suite(
      name="my-agent-tests",
      project_name="my-agent",
      global_assertions=[
          "The response directly addresses the user's question",
          "The response is concise (3 sentences or fewer)",
      ],
      global_execution_policy={"runs_per_item": 2, "pass_threshold": 2},
  )

  # Add test cases
  suite.insert([
      {"data": {"question": "How do I create a new project?", "context": "Go to Dashboard and click 'New Project'."}},
      {"data": {"question": "What are the pricing tiers?", "context": "Free ($0/month), Pro ($29/month), Enterprise (custom)."}},
  ])

  # Define the task
  def task(item):
      response = openai_client.chat.completions.create(
          model="gpt-4o-mini",
          messages=[
              {"role": "system", "content": "Answer based ONLY on the provided context."},
              {"role": "user", "content": f"Question: {item['question']}\n\nContext:\n{item['context']}"},
          ],
      )
      return {"input": item, "output": response.choices[0].message.content}

  # Run the evaluation
  result = opik.run_tests(test_suite=suite, task=task)
  print(f"Pass rate: {result.pass_rate:.0%}")
  ```

  ```ts title="Typescript"
  import { Opik, TestSuite, runTests } from "opik";
  import OpenAI from "openai";

  const client = new Opik();
  const openai = new OpenAI();

  // Create a suite with assertions
  const suite = await TestSuite.getOrCreate(client, {
      name: "my-agent-tests",
      projectName: "my-agent",
      globalAssertions: [
          "The response directly addresses the user's question",
          "The response is concise (3 sentences or fewer)",
      ],
      globalExecutionPolicy: { runsPerItem: 2, passThreshold: 2 },
  });

  // Add test cases
  await suite.insert([
      { data: { question: "How do I create a new project?", context: "Go to Dashboard and click 'New Project'." } },
      { data: { question: "What are the pricing tiers?", context: "Free ($0/month), Pro ($29/month), Enterprise (custom)." } },
  ]);

  // Define the task
  const task = async (item: Record<string, string>) => {
      const response = await openai.chat.completions.create({
          model: "gpt-4o-mini",
          messages: [
              { role: "system", content: "Answer based ONLY on the provided context." },
              { role: "user", content: `Question: ${item.question}\n\nContext:\n${item.context}` },
          ],
      });
      return { input: item, output: response.choices[0].message.content };
  };

  // Run the evaluation
  const result = await runTests({ testSuite: suite, task });
  console.log(`Pass rate: ${((result.passRate ?? 0) * 100).toFixed(0)}%`);
  ```
</CodeBlocks>

Each run creates an experiment in the Opik dashboard for easy comparison.
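
For example, you can tweak the prompt and run the same suite again; each run shows up as a separate experiment you can compare side by side. A minimal sketch reusing `suite` and `openai_client` from above (`task_v2` is illustrative, not part of the SDK):

```python
# Hypothetical revised task: same test suite, stricter system prompt.
def task_v2(item):
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer using ONLY the provided context, in one sentence."},
            {"role": "user", "content": f"Question: {item['question']}\n\nContext:\n{item['context']}"},
        ],
    )
    return {"input": item, "output": response.choices[0].message.content}

# Re-running against the same suite creates a second experiment in the dashboard.
result_v2 = opik.run_tests(test_suite=suite, task=task_v2)
print(f"Pass rate after prompt change: {result_v2.pass_rate:.0%}")
```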


See the [Building Test Suites](/evaluation/advanced/building-test-suites) guide for the full walkthrough.
</Tab>
<Tab title="Datasets & Metrics">

Dataset-based evaluation scores your agent's outputs using quantitative metrics like hallucination detection, answer relevance, or custom scoring functions (a minimal custom-metric sketch follows the quick-start code below).
<CodeBlocks>
  ```python title="Python"
  import opik
  from opik.evaluation import evaluate
  from opik.evaluation.metrics import Hallucination

  opik.configure()
  client = opik.Opik()

  # Create a dataset
  dataset = client.get_or_create_dataset(name="my-eval-dataset")
  dataset.insert([
      {"input": "What is the capital of France?", "expected_output": "Paris"},
      {"input": "What is 2+2?", "expected_output": "4"},
  ])

  # Define the task
  def task(item):
      result = call_llm(item["input"])  # call_llm is a placeholder for your own LLM call
      return {"output": result}

  # Run evaluation with metrics
  evaluate(
      dataset=dataset,
      task=task,
      scoring_metrics=[Hallucination()],
      experiment_name="my-experiment-v1",
  )
  ```

  ```ts title="Typescript"
  import { Opik } from "opik";

  const client = new Opik();

  // Create a dataset
  const dataset = await client.getOrCreateDataset({ name: "my-eval-dataset" });
  await dataset.insert([
      { input: "What is the capital of France?", expectedOutput: "Paris" },
      { input: "What is 2+2?", expectedOutput: "4" },
  ]);

  // Run the evaluation (add scoring metrics here as needed)
  await client.evaluate({
      dataset,
      task: async (item) => {
          const result = await callLlm(item.input); // callLlm is a placeholder for your own LLM call
          return { output: result };
      },
      experimentName: "my-experiment-v1",
  });
  ```
</CodeBlocks>
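
You can also score outputs with your own logic by writing a custom metric. A minimal sketch in Python, following the SDK's `BaseMetric`/`ScoreResult` pattern (the `LengthCheck` class and its character budget are illustrative, not built in):

```python
from opik.evaluation.metrics import base_metric, score_result

class LengthCheck(base_metric.BaseMetric):
    """Illustrative custom metric: scores 1.0 when the output fits a character budget."""

    def __init__(self, max_chars: int = 500, name: str = "length_check"):
        super().__init__(name=name)
        self.max_chars = max_chars

    def score(self, output: str, **ignored_kwargs) -> score_result.ScoreResult:
        within_budget = len(output) <= self.max_chars
        return score_result.ScoreResult(
            value=1.0 if within_budget else 0.0,
            name=self.name,
            reason=f"{len(output)} chars (budget: {self.max_chars})",
        )

# Use it alongside built-in metrics:
# evaluate(dataset=dataset, task=task,
#          scoring_metrics=[Hallucination(), LengthCheck(max_chars=300)])
```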

See the [Datasets & Experiments](/evaluation/advanced/evaluate_your_llm) guide for the full walkthrough
and the [Metrics](/evaluation/metrics/overview) section for all available metrics.
</Tab>
</Tabs>