apps/opik-documentation/documentation/docs/cookbook/gretel_opik_integration_cookbook.ipynb
The Story: You need high-quality Q&A datasets to evaluate your AI models, but creating them manually is time-consuming and expensive. This cookbook shows you how to use Gretel's synthetic data generation to create diverse, realistic Q&A datasets and import them into Opik for model evaluation and optimization.
What you'll accomplish:
Let's get started! 🎯
This cookbook demonstrates two methods for generating synthetic data with Gretel:
We'll start with Data Designer, then show Safe Synthetics as an alternative.
We'll install the Gretel client and Opik SDK:
# Install/upgrade the Gretel client, Opik SDK, and pandas (IPython magic; runs in the notebook kernel)
%pip install gretel-client opik pandas --upgrade --quiet
Let's authenticate with both Gretel and Opik:
import os
import getpass
import opik
import pandas as pd

print("๐ Setting up authentication...")

# Set up Gretel API key — prompt only when it is not already in the environment,
# so re-running the cell does not ask again.
if "GRETEL_API_KEY" not in os.environ:
    os.environ["GRETEL_API_KEY"] = getpass.getpass("Enter your Gretel API key: ")

# Set up Opik (will prompt for API key if not configured)
opik.configure()

# NOTE(review): the original cell had this string split across two lines by a
# corrupted emoji, leaving an unterminated literal; reconstructed as one print.
print("✅ Authentication completed!")
Now we'll use Gretel Data Designer to generate synthetic Q&A data. We'll create questions and answers about AI and machine learning:
from gretel_client.navigator_client import Gretel  # Use navigator_client instead!
from gretel_client.data_designer import columns as C
from gretel_client.data_designer import params as P

print("๐ค Setting up Q&A dataset generation with Gretel Data Designer...")

# Initialize Data Designer using the navigator_client and factory method
gretel_navigator = Gretel()  # This creates the navigator client
# model_suite="apache-2.0" restricts generation to Apache-2.0-licensed models
dd = gretel_navigator.data_designer.new(model_suite="apache-2.0")
# Add topic column (categorical sampler)
# Add topic column (categorical sampler): each generated record draws one
# value uniformly-ish from this fixed category list.
dd.add_column(
    C.SamplerColumn(
        name="topic",
        type=P.SamplerType.CATEGORY,
        params=P.CategorySamplerParams(
            values=[
                "neural networks", "deep learning", "machine learning", "NLP",
                "computer vision", "reinforcement learning", "AI ethics", "data science"
            ]
        )
    )
)
# Add difficulty column
# Add difficulty column: second categorical sampler so questions span
# beginner through advanced levels.
dd.add_column(
    C.SamplerColumn(
        name="difficulty",
        type=P.SamplerType.CATEGORY,
        params=P.CategorySamplerParams(
            values=["beginner", "intermediate", "advanced"]
        )
    )
)
# Add question column (LLM-generated)
# Add question column (LLM-generated). The {{ topic }} / {{ difficulty }}
# placeholders are template references to the sampler columns defined above,
# so each question is conditioned on that row's sampled values.
dd.add_column(
    C.LLMTextColumn(
        name="question",
        prompt=(
            "Generate a challenging, specific question about {{ topic }} "
            "at {{ difficulty }} level. The question should be clear, focused, "
            "and something a student or practitioner might actually ask."
        )
    )
)
# Add answer column (LLM-generated)
# Add answer column (LLM-generated). References the generated {{ question }}
# column so the answer is grounded in that row's question text.
dd.add_column(
    C.LLMTextColumn(
        name="answer",
        prompt=(
            "Provide a clear, accurate, and comprehensive answer to this {{ difficulty }}-level "
            "question about {{ topic }}: '{{ question }}'. The answer should be educational "
            "and directly address all aspects of the question."
        )
    )
)
print("๐ Generating Q&A dataset...")

# Generate the dataset; wait_until_done=True blocks until the workflow finishes
workflow_run = dd.create(num_records=20, wait_until_done=True)
synthetic_df = workflow_run.dataset.df

# NOTE(review): the original cell had this f-string split across two lines by a
# corrupted emoji, leaving an unterminated literal; reconstructed as one print.
print(f"✅ Generated {len(synthetic_df)} Q&A pairs!")
print(f"\n๐ Dataset shape: {synthetic_df.shape}")
print(f"๐ Columns: {list(synthetic_df.columns)}")

# Display first few rows
print("\n๐ Sample data:")
synthetic_df.head(3)
Let's convert our Gretel-generated data to the format Opik expects:
def convert_to_opik_format(df):
    """Convert a Gretel-generated Q&A DataFrame into Opik dataset items.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``question`` and ``answer`` columns; ``topic`` and
        ``difficulty`` are optional and fall back to defaults.

    Returns
    -------
    list[dict]
        One item per row in the shape Opik expects:
        ``{"input": {...}, "expected_output": ..., "metadata": {...}}``.
    """
    # Comprehension replaces the original manual append loop (same output).
    return [
        {
            "input": {"question": row["question"]},
            "expected_output": row["answer"],
            "metadata": {
                # Series.get returns the default when the column is absent
                "topic": row.get("topic", "AI/ML"),
                "difficulty": row.get("difficulty", "unknown"),
                "source": "gretel_navigator",
            },
        }
        for _, row in df.iterrows()
    ]
print("๐ Converting to Opik format...")
opik_data = convert_to_opik_format(synthetic_df)

# NOTE(review): the original cell had this f-string split across two lines by a
# corrupted emoji, leaving an unterminated literal; reconstructed as one print.
print(f"✅ Converted {len(opik_data)} items to Opik format!")

print("\n๐ Sample converted item:")
import json
print(json.dumps(opik_data[0], indent=2))
Now let's upload our dataset to Opik where it can be used for model evaluation:
print("๐ค Pushing dataset to Opik...")

# Initialize Opik client
opik_client = opik.Opik()

# Create the dataset (idempotent: returns the existing dataset on re-run)
dataset_name = "gretel-ai-qa-dataset"
dataset = opik_client.get_or_create_dataset(
    name=dataset_name,
    description="Synthetic Q&A dataset generated using Gretel Data Designer for AI/ML evaluation"
)

# Insert the data
dataset.insert(opik_data)

# NOTE(review): the original cell had this f-string split across two lines by a
# corrupted emoji, leaving an unterminated literal; reconstructed as one print.
print(f"✅ Successfully created dataset: {dataset.name}")
print(f"๐ Dataset ID: {dataset.id}")
print(f"๐ Total items: {len(opik_data)}")
The dataset can now be viewed in the UI:
Let's confirm the dataset was created successfully and see how to use it:
print("๐ Verifying dataset creation...")

# Try to retrieve the dataset; broad Exception is acceptable at this
# notebook-level boundary — we report the error and continue.
try:
    retrieved_dataset = opik_client.get_dataset(dataset_name)
    # NOTE(review): this f-string was split across two lines by a corrupted
    # emoji, leaving an unterminated literal; reconstructed as one print.
    print(f"✅ Dataset verified: {retrieved_dataset.name}")
    print(f"๐ Dataset ID: {retrieved_dataset.id}")
    # f-prefixes removed where there were no placeholders (output unchanged)
    print("\n๐ฏ Next steps:")
    print("1. Go to https://www.comet.com")
    print("2. Navigate to Opik โ Datasets")
    print(f"3. Find your dataset: {dataset_name}")
    print("4. Use it to evaluate your AI models!")
except Exception as e:
    print(f"โ Could not verify dataset: {e}")
    print("Please check your Opik configuration and try again.")
Here's how you can use your new dataset to evaluate a model with Opik:
# Example: Simple Q&A model evaluation
@opik.track
def simple_qa_model(input_data):
    """A simple example model that generates responses to questions"""
    question_text = input_data.get('question', '').lower()
    # Ordered keyword -> canned-answer pairs; the first match wins, which
    # reproduces the original if/elif ordering exactly.
    canned_responses = (
        ('neural network',
         "A neural network is a computational model inspired by biological neural networks."),
        ('machine learning',
         "Machine learning is a subset of AI that enables systems to learn from data."),
    )
    for keyword, response in canned_responses:
        if keyword in question_text:
            return response
    # Fallback for anything the toy model does not recognize
    return "This is a complex AI/ML topic that requires detailed explanation."
# Print a short summary of the example evaluation setup for the reader
print("๐งช Example model evaluation setup:")
print(f"Dataset: {dataset_name}")
print("Model: simple_qa_model (replace with your actual model)")
print("\n๐ก To run evaluation, uncomment and run the following code:")
print("\n๐ Integration complete! Your Gretel-generated dataset is ready for model evaluation in Opik.")
Congratulations! 🎉 You've successfully:
The key advantage of using Gretel Data Designer is its modular approach - you can define exactly what data you want using samplers (for categories) and LLM columns (for generated text), giving you precise control over your synthetic dataset.
Happy evaluating! ๐
If you have an existing Q&A dataset and want to create a synthetic version, you can use Gretel Safe Synthetics instead:
%%capture
# %%capture suppresses the (noisy) pip output for this cell
%pip install -U gretel-client
import pandas as pd
from gretel_client.navigator_client import Gretel

# Initialize Gretel client; api_key="prompt" asks interactively when no key is configured
gretel = Gretel(api_key="prompt")
# Option 1: Use Gretel's sample ecommerce dataset (has 200+ records)
my_data_source = "https://gretel-datasets.s3.us-west-2.amazonaws.com/ecommerce_customers.csv"

# Option 2: Create your own Q&A dataset (needs 200+ records for holdout).
# Five base Q&A rows are replicated 50x to reach 250 records.
_REPEATS = 50
_base_qa = [
    ('What is machine learning?',
     'Machine learning is a subset of AI that enables systems to learn from data.'),
    ('How do neural networks work?',
     'Neural networks are computational models inspired by biological neural networks.'),
    ('What is the difference between AI and ML?',
     'AI is the broader concept while ML is a specific approach to achieve AI.'),
    ('Explain deep learning concepts',
     'Deep learning uses multi-layer neural networks to model complex patterns.'),
    ('What are the applications of NLP?',
     'NLP applications include chatbots, translation, sentiment analysis, and text generation.'),
]

sample_questions = [question for question, _ in _base_qa] * _REPEATS
sample_answers = [answer for _, answer in _base_qa] * _REPEATS

sample_data = {
    'question': sample_questions,
    'answer': sample_answers,
    'topic': ['ML', 'Neural Networks', 'AI/ML', 'Deep Learning', 'NLP'] * _REPEATS,
    'difficulty': ['beginner', 'intermediate', 'beginner', 'advanced', 'intermediate'] * _REPEATS,
}

original_df = pd.DataFrame(sample_data)
print(f"๐ Original dataset: {len(original_df)} records")
print(original_df.head())
# Important: Gretel requires at least 200 records to use holdout
if len(original_df) < 200:
    print("โ ๏ธ Warning: Dataset has less than 200 records. Holdout will be disabled.")

# For quick demo with small dataset - disable holdout and transform.
# holdout=None skips the evaluation holdout split; num_records=5 keeps
# the synthesized output tiny for demonstration purposes.
synthetic_dataset = gretel.safe_synthetic_dataset \
    .from_data_source(original_df, holdout=None) \
    .synthesize(num_records=5) \
    .create()
# Wait for completion and get results (blocks until the workflow finishes)
synthetic_dataset.wait_until_done()
synthetic_df_safe = synthetic_dataset.dataset.df

# NOTE(review): the original cell had this f-string split across two lines by a
# corrupted emoji, leaving an unterminated literal; reconstructed as one print.
print(f"✅ Generated {len(synthetic_df_safe)} synthetic Q&A pairs using Safe Synthetics!")
print(synthetic_df_safe.head())

# Preview synthetic data
print("๐ Synthetic dataset preview:")
print(synthetic_dataset.dataset.df.head())
# View quality report table (summary statistics for the synthetic run)
print("๐ Quality Report Summary:")
print(synthetic_dataset.report.table)

# View detailed HTML report in notebook
# synthetic_dataset.report.display_in_notebook()

# Access workflow details (the YAML config the workflow ran with)
print("\n๐ง Workflow Configuration:")
print(synthetic_dataset.config_yaml)

# List all workflow steps by name
print("\n๐ Workflow Steps:")
for step in synthetic_dataset.steps:
    print(f"- {step.name}")
def convert_to_opik_format(df):
    """Convert a Gretel-generated Q&A DataFrame into Opik dataset items.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``question`` and ``answer`` columns; ``topic`` and
        ``difficulty`` are optional and fall back to defaults.

    Returns
    -------
    list[dict]
        One item per row in the shape Opik expects:
        ``{"input": {...}, "expected_output": ..., "metadata": {...}}``.
    """
    # Comprehension replaces the original manual append loop (same output).
    return [
        {
            "input": {"question": row["question"]},
            "expected_output": row["answer"],
            "metadata": {
                # Series.get returns the default when the column is absent
                "topic": row.get("topic", "AI/ML"),
                "difficulty": row.get("difficulty", "unknown"),
                "source": "gretel_navigator",
            },
        }
        for _, row in df.iterrows()
    ]
# Initialize Opik client if not already defined
opik_client = opik.Opik()

# Convert and upload to Opik (same process as before)
opik_data_safe = convert_to_opik_format(synthetic_df_safe)

# Create dataset in Opik (idempotent: returns the existing dataset on re-run)
dataset_safe = opik_client.get_or_create_dataset(
    name="gretel-safe-synthetics-qa-dataset",
    description="Synthetic Q&A dataset generated using Gretel Safe Synthetics"
)
dataset_safe.insert(opik_data_safe)

# NOTE(review): the original cell had this f-string split across two lines by a
# corrupted emoji, leaving an unterminated literal; reconstructed as one print.
print(f"✅ Safe Synthetics dataset created: {dataset_safe.name}")
The dataset can now be viewed in the UI:
| Dataset Size | Holdout Setting | Example |
|---|---|---|
| < 200 records | holdout=None | from_data_source(df, holdout=None) |
| 200+ records | Default (5%) or custom | from_data_source(df) or from_data_source(df, holdout=0.1) |
| Large datasets | Custom percentage/count | from_data_source(df, holdout=250) |
| Use Case | Recommended Approach | Why |
|---|---|---|
| Creating new datasets from scratch | Data Designer | More control, custom column types, guided generation |
| Synthesizing existing datasets | Safe Synthetics | Preserves statistical relationships, privacy-safe |
| Custom data structures | Data Designer | Flexible column definitions, template system |
| Production data replication | Safe Synthetics | Maintains data utility while ensuring privacy |
Both approaches integrate seamlessly with Opik for model evaluation! 🎯