apps/opik-documentation/documentation/docs/cookbook/huggingface_datasets_integration.ipynb
You will need:
This example will use:
This pip-install takes about a minute.
%pip install opik datasets transformers pandas tqdm huggingface_hub --upgrade
This step configures the Opik library for your session. It will prompt for your Comet API key if not already set in your environment or through Opik's configuration.
import opik
# Configure the Opik SDK for this notebook session.
opik.configure()
This example primarily uses public Hugging Face datasets. If you need to access private datasets, you can set your Hugging Face token:
import os
import getpass
# Uncomment if you need to access private datasets
# (huggingface_hub reads the token from the HF_TOKEN environment variable)
# if "HF_TOKEN" not in os.environ:
#     os.environ["HF_TOKEN"] = getpass.getpass("Enter your Hugging Face token: ")
import os
import pandas as pd
from datasets import load_dataset, Dataset as HFDataset
from opik import Opik
from typing import Dict, Any, Optional, List
import json
from tqdm import tqdm
import warnings
import numpy as np
# Suppress every warning for the rest of the session to keep the notebook output
# readable; no category or module filter is given, so this applies to all warnings.
warnings.filterwarnings('ignore')
class HuggingFaceToOpikConverter:
    """Utility class to convert Hugging Face datasets to Opik format.

    The converter loads a dataset from the Hugging Face Hub, turns every row
    into a dictionary of string fields, and uploads the result to an Opik
    dataset through the client supplied at construction time.
    """

    def __init__(self, opik_client: "Opik"):
        """Store the Opik client used to create datasets and insert items."""
        self.opik_client = opik_client

    def load_hf_dataset(
        self,
        dataset_name: str,
        split: Optional[str] = None,
        config: Optional[str] = None,
        subset_size: Optional[int] = None,
        **kwargs
    ) -> "HFDataset":
        """
        Load a dataset from Hugging Face Hub.

        Args:
            dataset_name: Name of the dataset on HF Hub
            split: Specific split to load (train, validation, test)
            config: Configuration/subset of the dataset
            subset_size: Limit the number of samples
            **kwargs: Additional arguments for load_dataset

        Returns:
            Loaded Hugging Face dataset
        """
        print(f"📥 Loading dataset: {dataset_name}")
        if config:
            print(f" Config: {config}")
        if split:
            print(f" Split: {split}")

        # Load the dataset
        # NOTE(review): with split=None, load_dataset presumably returns a mapping
        # of splits rather than a single dataset, in which case the size limit and
        # the sample count below would not behave as intended -- confirm.
        dataset = load_dataset(
            dataset_name,
            name=config,
            split=split,
            **kwargs
        )

        # Limit dataset size if specified (keeps the first `subset_size` rows)
        if subset_size and len(dataset) > subset_size:
            dataset = dataset.select(range(subset_size))
            print(f" Limited to {subset_size} samples")

        print(f" ✅ Loaded {len(dataset)} samples")
        print(f" Features: {list(dataset.features.keys())}")
        return dataset

    def _extract_field_value(self, row, column_name):
        """Return the value of `column_name` in `row` as a string.

        Lists, dicts and numpy arrays are JSON-encoded after their numpy
        contents are converted to native Python objects. Missing values
        (as reported by `pd.isna`) become an empty string. Everything else
        goes through `str()`.

        If conversion raises TypeError or ValueError, a warning is printed
        and `str(value)` is returned instead.
        """
        value = row[column_name]
        try:
            # Containers share one conversion path so nested numpy data is
            # handled the same way at every depth.
            if isinstance(value, (list, dict, np.ndarray)):
                return json.dumps(self._convert_numpy_to_python(value))
            if pd.isna(value):
                return ""
            return str(value)
        except (TypeError, ValueError) as e:
            # If JSON serialization fails, convert to string representation
            print(f"Warning: Could not serialize value for column '{column_name}': {e}")
            return str(value)

    def _convert_numpy_to_python(self, obj):
        """Recursively convert numpy objects to Python native types.

        Arrays become (nested) lists, numpy integer/floating/bool scalars
        become int/float/bool, and dicts and lists are rebuilt with their
        contents converted. Any other object is returned unchanged.
        """
        if isinstance(obj, np.ndarray):
            # tolist() leaves arrays stored inside an object-dtype array as
            # arrays, so the result is converted again instead of returned as-is.
            return self._convert_numpy_to_python(obj.tolist())
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, dict):
            return {key: self._convert_numpy_to_python(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [self._convert_numpy_to_python(item) for item in obj]
        return obj

    def convert_to_opik_format(
        self,
        hf_dataset: "HFDataset",
        input_column: str,
        output_column: Optional[str] = None,
        expected_output_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        custom_mapping: Optional[Dict[str, str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Convert HF dataset to Opik-compatible format.

        Optional columns that are not present in the dataset are skipped
        silently. Custom mappings are applied last, so they overwrite any
        field of the same name set earlier.

        Args:
            hf_dataset: Hugging Face dataset
            input_column: Column to use as input
            output_column: Column to use as output (optional)
            expected_output_column: Column to use as expected output (optional)
            metadata_columns: Additional columns to include as metadata
            custom_mapping: Custom mapping of HF columns to Opik fields

        Returns:
            List of Opik dataset items
        """
        opik_items = []
        metadata_columns = metadata_columns or []

        # Convert to pandas for easier manipulation
        df = hf_dataset.to_pandas()
        print(f"🔄 Converting {len(df)} samples to Opik format...")

        # Column availability does not change between rows, so resolve it once.
        available = set(df.columns)
        include_output = bool(output_column) and output_column in available
        include_expected = (
            bool(expected_output_column) and expected_output_column in available
        )
        present_metadata = [col for col in metadata_columns if col in available]
        present_mapping = [
            (hf_col, opik_field)
            for hf_col, opik_field in (custom_mapping or {}).items()
            if hf_col in available
        ]

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Converting"):
            # Build Opik item
            opik_item = {
                "input": self._extract_field_value(row, input_column)
            }

            # Add output if specified
            if include_output:
                opik_item["output"] = self._extract_field_value(row, output_column)

            # Add expected output if specified
            if include_expected:
                opik_item["expected_output"] = self._extract_field_value(
                    row, expected_output_column
                )

            # Add metadata columns (omitted entirely when none are present)
            if present_metadata:
                opik_item["metadata"] = {
                    col: self._extract_field_value(row, col)
                    for col in present_metadata
                }

            # Apply custom mapping
            for hf_col, opik_field in present_mapping:
                opik_item[opik_field] = self._extract_field_value(row, hf_col)

            opik_items.append(opik_item)

        print(f" ✅ Converted {len(opik_items)} items")
        return opik_items

    def create_opik_dataset(
        self,
        dataset_name: str,
        opik_items: List[Dict[str, Any]],
        description: Optional[str] = None
    ):
        """Create or update an Opik dataset with the converted items.

        Args:
            dataset_name: Name of the Opik dataset to fetch or create
            opik_items: Items to insert
            description: Dataset description; a generic one is used when omitted

        Returns:
            The dataset object returned by the Opik client
        """
        print(f"📤 Creating Opik dataset: {dataset_name}")

        # Create or get existing dataset
        dataset = self.opik_client.get_or_create_dataset(
            name=dataset_name,
            description=description or "Dataset imported from Hugging Face"
        )

        # Insert items in batches
        batch_size = 100
        for i in tqdm(range(0, len(opik_items), batch_size), desc="Uploading batches"):
            batch = opik_items[i:i + batch_size]
            dataset.insert(batch)

        print(f" ✅ Successfully created dataset with {len(opik_items)} items")
        return dataset

    def import_dataset(
        self,
        hf_dataset_name: str,
        opik_dataset_name: str,
        input_column: str,
        split: Optional[str] = "train",
        config: Optional[str] = None,
        output_column: Optional[str] = None,
        expected_output_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        custom_mapping: Optional[Dict[str, str]] = None,
        subset_size: Optional[int] = None,
        description: Optional[str] = None
    ):
        """Complete pipeline to import HF dataset to Opik.

        Runs `load_hf_dataset`, `convert_to_opik_format` and
        `create_opik_dataset` in that order and returns the resulting Opik
        dataset. The arguments are forwarded unchanged to those methods.
        """
        print(f"🚀 Starting import: {hf_dataset_name} → {opik_dataset_name}")

        # Load HF dataset
        hf_dataset = self.load_hf_dataset(
            hf_dataset_name,
            split=split,
            config=config,
            subset_size=subset_size
        )

        # Convert to Opik format
        opik_items = self.convert_to_opik_format(
            hf_dataset,
            input_column=input_column,
            output_column=output_column,
            expected_output_column=expected_output_column,
            metadata_columns=metadata_columns,
            custom_mapping=custom_mapping
        )

        # Create Opik dataset
        dataset = self.create_opik_dataset(
            opik_dataset_name,
            opik_items,
            description=description
        )

        print("🎉 Import completed successfully!")
        return dataset
# Initialize the converter with a default Opik client.
# Both `opik_client` and `converter` are reused by the cells below.
opik_client = Opik()
converter = HuggingFaceToOpikConverter(opik_client)
Let's start with a popular question-answering dataset:
# Import SQuAD: questions become the input, the `answers` column the expected
# output, and `context` / `id` are attached as metadata.
print("📚 Example 1: Importing SQuAD Dataset")
print("=" * 50)
squad_dataset = converter.import_dataset(
    hf_dataset_name="rajpurkar/squad",
    opik_dataset_name="squad-qa-dataset",
    input_column="question",
    expected_output_column="answers",
    split="train",
    subset_size=1000,  # Limit to 1000 samples for demo
    metadata_columns=["context", "id"],
    description="SQuAD question-answering dataset imported from Hugging Face"
)
print(f"Found {len(squad_dataset.to_pandas())} items in SQuAD dataset")
Now let's import a sentiment analysis dataset:
# Import IMDB reviews: the review text is the input and the `label` column
# is the expected output.
print("\n🎬 Example 2: Importing IMDB Movie Reviews")
print("=" * 50)
imdb_dataset = converter.import_dataset(
    hf_dataset_name="stanfordnlp/imdb",
    opik_dataset_name="imdb-sentiment-dataset",
    input_column="text",
    expected_output_column="label",
    split="train",
    subset_size=500,  # Limit to 500 samples for demo
    description="IMDB movie reviews sentiment dataset imported from Hugging Face"
)
Let's import a mathematical reasoning dataset:
# Import GSM8K (config "main"): the question is the input and the `answer`
# column is the expected output.
print("\n🔢 Example 3: GSM8K Mathematical Reasoning Dataset")
print("=" * 50)
gsm8k_dataset = converter.import_dataset(
    hf_dataset_name="openai/gsm8k",
    opik_dataset_name="gsm8k-math-problems",
    config="main",
    input_column="question",
    expected_output_column="answer",
    split="train",
    subset_size=200,
    description="GSM8K mathematical reasoning dataset imported from Hugging Face"
)
Let's import a dataset for evaluating truthfulness:
# Import TruthfulQA (config "generation"). The import is wrapped in try/except
# so a failure here is reported without stopping the rest of the notebook.
print("\n🔍 Example 4: TruthfulQA Dataset")
print("=" * 50)
try:
    truthfulqa_dataset = converter.import_dataset(
        hf_dataset_name="truthful_qa",
        opik_dataset_name="truthfulqa-dataset",
        config="generation",
        input_column="question",
        expected_output_column="best_answer",
        split="validation",
        subset_size=100,
        metadata_columns=["category", "source"],
        description="TruthfulQA dataset for evaluating truthfulness imported from Hugging Face"
    )
except Exception as e:
    print(f"Note: TruthfulQA import failed: {e}")
    print("This might be due to dataset structure changes or access restrictions.")
Let's check what datasets we've successfully imported:
print("\n✅ Verification: Listing All Datasets")
print("=" * 50)
# List all datasets in Opik
datasets = opik_client.get_datasets()
print(f"Total datasets in Opik: {len(datasets)}")
for dataset in datasets:
    print(f" 📊 {dataset.name}")
    # Only print a description when the object exposes a non-empty one.
    if hasattr(dataset, 'description') and dataset.description:
        print(f" Description: {dataset.description}")
You can now go to the Opik app to see your imported datasets:
Here's how you can use the imported datasets for LLM evaluation:
# Section banner for the evaluation walkthrough.
print("\n🧪 Example: Using Imported Dataset for Evaluation")
print("=" * 50)
from opik.evaluation.metrics import LevenshteinRatio
# Example evaluation function
def dummy_llm_function(input_question: str) -> str:
    """
    Stand-in for a real LLM call, used only for demonstration.

    Echoes at most the first 50 characters of the question inside a fixed
    template. Replace this with a call to your actual model.
    """
    question_preview = input_question[:50]
    return "This is a dummy answer to: {}...".format(question_preview)
# Get one of our imported datasets
# Any failure in the lookup or the loop below is reported by the except branch.
try:
    squad_dataset = opik_client.get_dataset("squad-qa-dataset")
    # Convert to pandas for easier handling
    df = squad_dataset.to_pandas().head(3)  # Just show first 3 for demo
    print("Running sample evaluation...")
    for idx, row in df.iterrows():
        input_text = row['input']
        # Fall back to a placeholder when the row has no expected output.
        expected = row.get('expected_output', 'No expected output')
        predicted = dummy_llm_function(input_text)
        print(f"\nSample {idx + 1}:")
        # Long values are truncated to 100 characters to keep the output short.
        print(f"Input: {input_text[:100]}...")
        print(f"Expected: {str(expected)[:100]}...")
        print(f"Predicted: {predicted}")
        print("-" * 40)
except Exception as e:
    print(f"Evaluation example failed: {e}")
    print("Make sure you have successfully imported a dataset first.")
Here's a helper function to quickly import popular evaluation datasets:
def import_common_datasets(converter, subset_size=200):
    """Import several popular datasets for LLM evaluation.

    Each entry in the built-in list is imported from its "validation" split
    through `converter.import_dataset`. A failure for one dataset is printed
    and does not stop the remaining imports.

    Args:
        converter: Object exposing `import_dataset(...)` with the keyword
            arguments used below.
        subset_size: Maximum number of samples to import per dataset.
    """
    common_datasets = [
        {
            "hf_name": "ai2_arc",
            "opik_name": "arc-reasoning-dataset",
            "config": "ARC-Challenge",
            "input_col": "question",
            "expected_col": "answerKey",
            "metadata_cols": ["choices"],
            "description": "AI2 ARC reasoning dataset"
        },
        {
            "hf_name": "winogrande",
            "opik_name": "winogrande-commonsense-dataset",
            "config": "winogrande_debiased",
            "input_col": "sentence",
            "expected_col": "answer",
            "metadata_cols": ["option1", "option2"],
            "description": "WinoGrande commonsense reasoning dataset"
        },
        {
            # No "config" key here: .get("config") below passes None for it.
            "hf_name": "hellaswag",
            "opik_name": "hellaswag-dataset",
            "input_col": "ctx",
            "expected_col": "label",
            "metadata_cols": ["endings", "source_id"],
            "description": "HellaSwag commonsense reasoning dataset"
        }
    ]

    for dataset_config in common_datasets:
        try:
            print(f"\n📥 Importing {dataset_config['hf_name']}...")
            converter.import_dataset(
                hf_dataset_name=dataset_config["hf_name"],
                opik_dataset_name=dataset_config["opik_name"],
                config=dataset_config.get("config"),
                input_column=dataset_config["input_col"],
                expected_output_column=dataset_config["expected_col"],
                metadata_columns=dataset_config.get("metadata_cols"),
                subset_size=subset_size,
                description=dataset_config["description"],
                split="validation"  # Use validation split for evaluation
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next dataset.
            print(f"❌ Failed to import {dataset_config['hf_name']}: {e}")
            print(" This might be due to dataset structure changes or access restrictions.")
# Uncomment to import common datasets
# import_common_datasets(converter, subset_size=100)
For more complex datasets, you might need custom processing:
def import_custom_translation_dataset():
    """Example of importing a translation dataset with custom processing.

    Loads the "en-es" configuration of Helsinki-NLP/opus-100, converts the
    first 100 training rows into Opik items (English text as input, Spanish
    text as expected output) and uploads them to the
    "opus-translation-en-es" Opik dataset using the module-level
    `opik_client`. Any failure is printed rather than raised.
    """
    print("\n🌍 Advanced Example: Translation Dataset")
    print("=" * 50)
    try:
        # Load a translation dataset
        from datasets import load_dataset

        # Example with a simple translation dataset
        hf_dataset = load_dataset("Helsinki-NLP/opus-100", "en-es", split="train")

        # Custom processing for translation pairs: each row carries a
        # `translation` object keyed by language code.
        opik_items = [
            {
                "input": item['translation']['en'],  # English source
                "expected_output": item['translation']['es'],  # Spanish target
                "metadata": {
                    "language_pair": "en-es",
                    "domain": "general",
                    "sample_id": i
                }
            }
            for i, item in enumerate(hf_dataset.select(range(100)))  # Limit for demo
        ]

        # Create Opik dataset (kept under its own name so it does not shadow
        # the Hugging Face dataset loaded above)
        opik_dataset = opik_client.get_or_create_dataset(
            name="opus-translation-en-es",
            description="OPUS-100 translation dataset (English to Spanish)"
        )

        # Insert items in batches
        batch_size = 50
        for start in tqdm(range(0, len(opik_items), batch_size), desc="Uploading"):
            opik_dataset.insert(opik_items[start:start + batch_size])

        print(f"✅ Successfully imported {len(opik_items)} translation pairs")
    except Exception as e:
        print(f"Translation dataset example failed: {e}")
        print("This is expected as dataset structures can vary.")
# Uncomment to try the translation example
# import_custom_translation_dataset()
This section shows how to create a dataset programmatically, similar to the synthetic data generation example:
def create_sample_qa_dataset():
    """Build a five-item Q&A dataset and upload it to Opik.

    The items are inserted into the "sample-qa-dataset" Opik dataset, which is
    fetched or created through the module-level `opik_client`. Returns that
    dataset object.
    """
    field_names = ("question", "answer", "category", "difficulty")
    rows = (
        ("What is the capital of France?", "Paris", "geography", "easy"),
        ("Who wrote Romeo and Juliet?", "William Shakespeare", "literature", "easy"),
        ("What is the largest planet in our solar system?", "Jupiter", "science", "medium"),
        ("In what year did World War II end?", "1945", "history", "medium"),
        ("What is the chemical symbol for gold?", "Au", "science", "hard"),
    )
    # Pair every row with the field names to get one dict per item.
    qa_records = [dict(zip(field_names, values)) for values in rows]

    qa_dataset = opik_client.get_or_create_dataset(
        name="sample-qa-dataset",
        description="Sample Q&A dataset for demonstration"
    )
    qa_dataset.insert(qa_records)
    print(f"Opik Dataset '{qa_dataset.name}' created with ID: {qa_dataset.id}")
    return qa_dataset
# Create the sample dataset and keep a handle to it for later cells
sample_dataset = create_sample_qa_dataset()