apps/opik-documentation/documentation/fern/docs-v2/evaluation/getting-started.mdx
Opik provides two approaches to evaluation. Choose the one that fits your use case.

The first approach builds a test suite: you attach plain-language assertions and an execution policy, insert test cases, and run your task against them.
<CodeBlocks>
```python title="Python"
import opik
from openai import OpenAI
from opik.integrations.openai import track_openai

openai_client = track_openai(OpenAI())
opik_client = opik.Opik()

# Create a suite with assertions
suite = opik_client.get_or_create_test_suite(
    name="my-agent-tests",
    project_name="my-agent",
    global_assertions=[
        "The response directly addresses the user's question",
        "The response is concise (3 sentences or fewer)",
    ],
    global_execution_policy={"runs_per_item": 2, "pass_threshold": 2},
)

# Add test cases
suite.insert([
    {"data": {"question": "How do I create a new project?", "context": "Go to Dashboard and click 'New Project'."}},
    {"data": {"question": "What are the pricing tiers?", "context": "Free ($0/month), Pro ($29/month), Enterprise (custom)."}},
])

# Define the task
def task(item):
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer based ONLY on the provided context."},
            {"role": "user", "content": f"Question: {item['question']}\n\nContext:\n{item['context']}"},
        ],
    )
    return {"input": item, "output": response.choices[0].message.content}

# Run the evaluation
result = opik.run_tests(test_suite=suite, task=task)
print(f"Pass rate: {result.pass_rate:.0%}")
```
```ts title="Typescript"
import { Opik, TestSuite, runTests } from "opik";
import OpenAI from "openai";

const client = new Opik();
const openai = new OpenAI();

// Create a suite with assertions
const suite = await TestSuite.getOrCreate(client, {
  name: "my-agent-tests",
  projectName: "my-agent",
  globalAssertions: [
    "The response directly addresses the user's question",
    "The response is concise (3 sentences or fewer)",
  ],
  globalExecutionPolicy: { runsPerItem: 2, passThreshold: 2 },
});

// Add test cases
await suite.insert([
  { data: { question: "How do I create a new project?", context: "Go to Dashboard and click 'New Project'." } },
  { data: { question: "What are the pricing tiers?", context: "Free ($0/month), Pro ($29/month), Enterprise (custom)." } },
]);

// Define the task
const task = async (item: Record<string, string>) => {
  const response = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [
      { role: "system", content: "Answer based ONLY on the provided context." },
      { role: "user", content: `Question: ${item.question}\n\nContext:\n${item.context}` },
    ],
  });
  return { input: item, output: response.choices[0].message.content };
};

// Run the evaluation
const result = await runTests({ testSuite: suite, task });
console.log(`Pass rate: ${((result.passRate ?? 0) * 100).toFixed(0)}%`);
```
</CodeBlocks>
Each run creates an experiment in the Opik dashboard for easy comparison.
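For example, re-running the same suite after tweaking the prompt produces a second experiment you can compare against the first. A minimal sketch that reuses the suite and client from above (the `task_v2` name and the alternate system prompt are purely illustrative):

```python title="Python"
# Hypothetical second iteration: same suite, different system prompt
def task_v2(item):
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Answer in three sentences or fewer, using ONLY the provided context."},
            {"role": "user", "content": f"Question: {item['question']}\n\nContext:\n{item['context']}"},
        ],
    )
    return {"input": item, "output": response.choices[0].message.content}

# Each run_tests call records a new experiment for the same suite
result_v2 = opik.run_tests(test_suite=suite, task=task_v2)
print(f"Pass rate after prompt change: {result_v2.pass_rate:.0%}")
```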
See the [Building Test Suites](/evaluation/advanced/building-test-suites) guide for the full walkthrough.

The second approach evaluates a dataset: you insert items, define a task, and score the outputs with metrics such as `Hallucination`.
<CodeBlocks>
```python title="Python"
import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Hallucination

opik.configure()
client = opik.Opik()

# Create a dataset
dataset = client.get_or_create_dataset(name="my-eval-dataset")
dataset.insert([
    {"input": "What is the capital of France?", "expected_output": "Paris"},
    {"input": "What is 2+2?", "expected_output": "4"},
])

# Define the task
def task(item):
    # Your LLM call here
    result = call_llm(item["input"])
    return {"output": result}

# Run evaluation with metrics
evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Hallucination()],
    experiment_name="my-experiment-v1",
)
```
```ts title="Typescript"
import { Opik } from "opik";

const client = new Opik();

// Create a dataset
const dataset = await client.getOrCreateDataset({ name: "my-eval-dataset" });
await dataset.insert([
  { input: "What is the capital of France?", expectedOutput: "Paris" },
  { input: "What is 2+2?", expectedOutput: "4" },
]);

// Run the evaluation
await client.evaluate({
  dataset,
  task: async (item) => {
    // Your LLM call here
    const result = await callLlm(item.input);
    return { output: result };
  },
  experimentName: "my-experiment-v1",
});
```
</CodeBlocks>
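If you need more than one check per item, you can pass several metrics to `evaluate` at once. A minimal sketch combining the LLM-judge `Hallucination` metric with the heuristic `Equals` metric; it assumes `scoring_key_mapping` is used to route the dataset's `expected_output` field to the `reference` argument that `Equals` expects (treat the exact mapping key as an assumption and confirm it in the Metrics docs):

```python title="Python"
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals, Hallucination

evaluate(
    dataset=dataset,
    task=task,
    # One LLM-judge metric plus one deterministic string check
    scoring_metrics=[Hallucination(), Equals()],
    # Assumed mapping: feed the dataset's expected_output to Equals' reference argument
    scoring_key_mapping={"reference": "expected_output"},
    experiment_name="my-experiment-v2",
)
```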
See the [Datasets & Experiments](/evaluation/advanced/evaluate_your_llm) guide for the full walkthrough
and the [Metrics](/evaluation/metrics/overview) section for all available metrics.