
Hugging Face Datasets to Opik Integration

You will need:

  1. A Comet account to view Opik visualizations (free!) - comet.com
  2. Optional: A Hugging Face account for private datasets - huggingface.co/settings/tokens

This example will use:

  • datasets to load Hugging Face datasets
  • opik to create and manage Opik datasets

Setup

This pip-install takes about a minute.

python
%pip install opik datasets transformers pandas tqdm huggingface_hub --upgrade

This step configures the Opik library for your session. It will prompt for your Comet API key if not already set in your environment or through Opik's configuration.

python
import opik
opik.configure()
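
If you prefer to configure Opik non-interactively (for example, in CI), you can set your credentials through environment variables before calling opik.configure(). The sketch below assumes the OPIK_API_KEY and OPIK_WORKSPACE variable names and the use_local option from the Opik SDK; adjust them to your deployment.

python
import os

# Assumed environment variable names; set these before calling opik.configure()
# os.environ["OPIK_API_KEY"] = "<your-comet-api-key>"
# os.environ["OPIK_WORKSPACE"] = "<your-workspace-name>"

# For a self-hosted Opik instance, the API key can be skipped entirely:
# opik.configure(use_local=True)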

For this example, we'll primarily use Hugging Face public datasets, but if you need private datasets, you can set your Hugging Face token:

python
import os
import getpass
# Uncomment if you need to access private datasets
# if "HUGGINGFACE_TOKEN" not in os.environ:
#     os.environ["HUGGINGFACE_TOKEN"] = getpass.getpass("Enter your Hugging Face token: ")
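
Once the token is set, there are two common ways to use it: log in once for the whole session with huggingface_hub, or pass the token directly to load_dataset. A minimal sketch follows; the repository name is a hypothetical placeholder.

python
# Option A: authenticate once for the session
# from huggingface_hub import login
# login(token=os.environ["HUGGINGFACE_TOKEN"])

# Option B: pass the token per call (the dataset name below is a placeholder)
# from datasets import load_dataset
# private_ds = load_dataset("my-org/my-private-dataset", token=os.environ["HUGGINGFACE_TOKEN"])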

Creating the HuggingFace to Opik Converter

python
import os
import pandas as pd
from datasets import load_dataset, Dataset as HFDataset
from opik import Opik
from typing import Dict, Any, Optional, List
import json
from tqdm import tqdm
import warnings
import numpy as np
warnings.filterwarnings('ignore')
python
class HuggingFaceToOpikConverter:
    """Utility class to convert Hugging Face datasets to Opik format."""
    
    def __init__(self, opik_client: Opik):
        self.opik_client = opik_client
    
    def load_hf_dataset(
        self, 
        dataset_name: str, 
        split: Optional[str] = None,
        config: Optional[str] = None,
        subset_size: Optional[int] = None,
        **kwargs
    ) -> HFDataset:
        """
        Load a dataset from Hugging Face Hub.
        
        Args:
            dataset_name: Name of the dataset on HF Hub
            split: Specific split to load (train, validation, test)
            config: Configuration/subset of the dataset
            subset_size: Limit the number of samples
            **kwargs: Additional arguments for load_dataset
        
        Returns:
            Loaded Hugging Face dataset
        """
        print(f"๐Ÿ“ฅ Loading dataset: {dataset_name}")
        if config:
            print(f"   Config: {config}")
        if split:
            print(f"   Split: {split}")
        
        # Load the dataset
        dataset = load_dataset(
            dataset_name, 
            name=config,
            split=split,
            **kwargs
        )
        
        # Limit dataset size if specified
        if subset_size and len(dataset) > subset_size:
            dataset = dataset.select(range(subset_size))
            print(f"   Limited to {subset_size} samples")
        
        print(f"   โœ… Loaded {len(dataset)} samples")
        print(f"   Features: {list(dataset.features.keys())}")
        
        return dataset
    
    def _extract_field_value(self, row, column_name):
        """Extract and convert field value to string if needed."""
        value = row[column_name]
        
        # Handle different data types
        try:
            if isinstance(value, (list, dict)):
                # Convert numpy arrays to regular Python objects before JSON serialization
                if isinstance(value, dict):
                    value = self._convert_numpy_to_python(value)
                elif isinstance(value, list):
                    value = [self._convert_numpy_to_python(item) for item in value]
                return json.dumps(value)
            elif isinstance(value, np.ndarray):
                # Convert numpy array to list then to JSON
                return json.dumps(value.tolist())
            elif pd.isna(value):
                return ""
            else:
                return str(value)
        except (TypeError, ValueError) as e:
            # If JSON serialization fails, convert to string representation
            print(f"Warning: Could not serialize value for column '{column_name}': {e}")
            return str(value)
    
    def _convert_numpy_to_python(self, obj):
        """Recursively convert numpy objects to Python native types."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.bool_):
            return bool(obj)
        elif isinstance(obj, dict):
            return {key: self._convert_numpy_to_python(value) for key, value in obj.items()}
        elif isinstance(obj, list):
            return [self._convert_numpy_to_python(item) for item in obj]
        else:
            return obj
    
    def convert_to_opik_format(
        self, 
        hf_dataset: HFDataset,
        input_column: str,
        output_column: Optional[str] = None,
        expected_output_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        custom_mapping: Optional[Dict[str, str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Convert HF dataset to Opik-compatible format.
        
        Args:
            hf_dataset: Hugging Face dataset
            input_column: Column to use as input
            output_column: Column to use as output (optional)
            expected_output_column: Column to use as expected output (optional)
            metadata_columns: Additional columns to include as metadata
            custom_mapping: Custom mapping of HF columns to Opik fields
        
        Returns:
            List of Opik dataset items
        """
        opik_items = []
        metadata_columns = metadata_columns or []
        
        # Convert to pandas for easier manipulation
        df = hf_dataset.to_pandas()
        
        print(f"๐Ÿ”„ Converting {len(df)} samples to Opik format...")
        
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Converting"):
            # Build Opik item
            opik_item = {
                "input": self._extract_field_value(row, input_column)
            }
            
            # Add output if specified
            if output_column and output_column in df.columns:
                opik_item["output"] = self._extract_field_value(row, output_column)
            
            # Add expected output if specified
            if expected_output_column and expected_output_column in df.columns:
                opik_item["expected_output"] = self._extract_field_value(row, expected_output_column)
            
            # Add metadata columns
            if metadata_columns:
                metadata = {}
                for col in metadata_columns:
                    if col in df.columns:
                        metadata[col] = self._extract_field_value(row, col)
                if metadata:
                    opik_item["metadata"] = metadata
            
            # Apply custom mapping
            if custom_mapping:
                for hf_col, opik_field in custom_mapping.items():
                    if hf_col in df.columns:
                        opik_item[opik_field] = self._extract_field_value(row, hf_col)
            
            opik_items.append(opik_item)
        
        print(f"   โœ… Converted {len(opik_items)} items")
        return opik_items
    
    def create_opik_dataset(
        self,
        dataset_name: str,
        opik_items: List[Dict[str, Any]],
        description: Optional[str] = None
    ):
        """Create or update an Opik dataset with the converted items."""
        print(f"๐Ÿ“ค Creating Opik dataset: {dataset_name}")
        
        # Create or get existing dataset
        dataset = self.opik_client.get_or_create_dataset(
            name=dataset_name,
            description=description or f"Dataset imported from Hugging Face"
        )
        
        # Insert items in batches
        batch_size = 100
        
        for i in tqdm(range(0, len(opik_items), batch_size), desc="Uploading batches"):
            batch = opik_items[i:i + batch_size]
            dataset.insert(batch)
        
        print(f"   โœ… Successfully created dataset with {len(opik_items)} items")
        return dataset
    
    def import_dataset(
        self,
        hf_dataset_name: str,
        opik_dataset_name: str,
        input_column: str,
        split: Optional[str] = "train",
        config: Optional[str] = None,
        output_column: Optional[str] = None,
        expected_output_column: Optional[str] = None,
        metadata_columns: Optional[List[str]] = None,
        custom_mapping: Optional[Dict[str, str]] = None,
        subset_size: Optional[int] = None,
        description: Optional[str] = None
    ):
        """Complete pipeline to import HF dataset to Opik."""
        print(f"๐Ÿš€ Starting import: {hf_dataset_name} โ†’ {opik_dataset_name}")
        
        # Load HF dataset
        hf_dataset = self.load_hf_dataset(
            hf_dataset_name, 
            split=split, 
            config=config, 
            subset_size=subset_size
        )
        
        # Convert to Opik format
        opik_items = self.convert_to_opik_format(
            hf_dataset,
            input_column=input_column,
            output_column=output_column,
            expected_output_column=expected_output_column,
            metadata_columns=metadata_columns,
            custom_mapping=custom_mapping
        )
        
        # Create Opik dataset
        dataset = self.create_opik_dataset(
            opik_dataset_name,
            opik_items,
            description=description
        )
        
        print(f"๐ŸŽ‰ Import completed successfully!")
        return dataset

# Initialize the converter
opik_client = Opik()
converter = HuggingFaceToOpikConverter(opik_client)

Example 1: Importing SQuAD Dataset

Let's start with a popular question-answering dataset:

python
print("๐Ÿ“š Example 1: Importing SQuAD Dataset")
print("=" * 50)

squad_dataset = converter.import_dataset(
    hf_dataset_name="rajpurkar/squad",
    opik_dataset_name="squad-qa-dataset",
    input_column="question",
    expected_output_column="answers",
    split="train",
    subset_size=1000,  # Limit to 1000 samples for demo
    metadata_columns=["context", "id"],
    description="SQuAD question-answering dataset imported from Hugging Face"
)
python
print(f"Found {len(squad_dataset.to_pandas())} items in SQuAD dataset")

Example 2: Importing IMDB Movie Reviews

Now let's import a sentiment analysis dataset:

python
print("\n๐ŸŽฌ Example 2: Importing IMDB Movie Reviews")
print("=" * 50)

imdb_dataset = converter.import_dataset(
    hf_dataset_name="stanfordnlp/imdb",
    opik_dataset_name="imdb-sentiment-dataset",
    input_column="text",
    expected_output_column="label",
    split="train",
    subset_size=500,  # Limit to 500 samples for demo
    description="IMDB movie reviews sentiment dataset imported from Hugging Face"
)

Example 3: Importing GSM8K Math Problems

Let's import a mathematical reasoning dataset:

python
print("\n๐Ÿ”ข Example 3: GSM8K Mathematical Reasoning Dataset")
print("=" * 50)

gsm8k_dataset = converter.import_dataset(
    hf_dataset_name="openai/gsm8k",
    opik_dataset_name="gsm8k-math-problems",
    config="main",
    input_column="question",
    expected_output_column="answer",
    split="train",
    subset_size=200,
    description="GSM8K mathematical reasoning dataset imported from Hugging Face"
)

Example 4: Importing TruthfulQA Dataset

Let's import a dataset for evaluating truthfulness:

python
print("\n๐Ÿ” Example 4: TruthfulQA Dataset")
print("=" * 50)

try:
    truthfulqa_dataset = converter.import_dataset(
        hf_dataset_name="truthful_qa",
        opik_dataset_name="truthfulqa-dataset",
        config="generation",
        input_column="question",
        expected_output_column="best_answer",
        split="validation",
        subset_size=100,
        metadata_columns=["category", "source"],
        description="TruthfulQA dataset for evaluating truthfulness imported from Hugging Face"
    )
except Exception as e:
    print(f"Note: TruthfulQA import failed: {e}")
    print("This might be due to dataset structure changes or access restrictions.")

Verifying Your Imported Datasets

Let's check what datasets we've successfully imported:

python
print("\nโœ… Verification: Listing All Datasets")
print("=" * 50)

# List all datasets in Opik
datasets = opik_client.get_datasets()
print(f"Total datasets in Opik: {len(datasets)}")

for dataset in datasets:
    print(f"  ๐Ÿ“ {dataset.name}")
    if hasattr(dataset, 'description') and dataset.description:
        print(f"     Description: {dataset.description}")

You can now go to the Opik app to see your imported datasets.

Using Imported Datasets for Evaluation

Here's how you can use the imported datasets for LLM evaluation:

python
print("\n๐Ÿงช Example: Using Imported Dataset for Evaluation")
print("=" * 50)

from opik.evaluation.metrics import LevenshteinRatio

# Example evaluation function
def dummy_llm_function(input_question: str) -> str:
    """
    Dummy LLM function for demonstration.
    In real use, this would call your actual LLM.
    """
    return f"This is a dummy answer to: {input_question[:50]}..."

# Get one of our imported datasets
try:
    squad_dataset = opik_client.get_dataset("squad-qa-dataset")
    
    # Convert to pandas for easier handling
    df = squad_dataset.to_pandas().head(3)  # Just show first 3 for demo
    
    print("Running sample evaluation...")
    for idx, row in df.iterrows():
        input_text = row['input']
        expected = row.get('expected_output', 'No expected output')
        predicted = dummy_llm_function(input_text)
        
        print(f"\nSample {idx + 1}:")
        print(f"Input: {input_text[:100]}...")
        print(f"Expected: {str(expected)[:100]}...")
        print(f"Predicted: {predicted}")
        print("-" * 40)

except Exception as e:
    print(f"Evaluation example failed: {e}")
    print("Make sure you have successfully imported a dataset first.")

Helper Function for Common Datasets

Here's a helper function to quickly import popular evaluation datasets:

python
def import_common_datasets(converter, subset_size=200):
    """Import several popular datasets for LLM evaluation."""
    
    common_datasets = [
        {
            "hf_name": "ai2_arc",
            "opik_name": "arc-reasoning-dataset", 
            "config": "ARC-Challenge",
            "input_col": "question",
            "expected_col": "answerKey",
            "metadata_cols": ["choices"],
            "description": "AI2 ARC reasoning dataset"
        },
        {
            "hf_name": "winogrande",
            "opik_name": "winogrande-commonsense-dataset",
            "config": "winogrande_debiased",
            "input_col": "sentence",
            "expected_col": "answer",
            "metadata_cols": ["option1", "option2"],
            "description": "WinoGrande commonsense reasoning dataset"
        },
        {
            "hf_name": "hellaswag",
            "opik_name": "hellaswag-dataset",
            "input_col": "ctx",
            "expected_col": "label",
            "metadata_cols": ["endings", "source_id"],
            "description": "HellaSwag commonsense reasoning dataset"
        }
    ]
    
    for dataset_config in common_datasets:
        try:
            print(f"\n๐Ÿ“ฅ Importing {dataset_config['hf_name']}...")
            converter.import_dataset(
                hf_dataset_name=dataset_config["hf_name"],
                opik_dataset_name=dataset_config["opik_name"],
                config=dataset_config.get("config"),
                input_column=dataset_config["input_col"],
                expected_output_column=dataset_config["expected_col"],
                metadata_columns=dataset_config.get("metadata_cols"),
                subset_size=subset_size,
                description=dataset_config["description"],
                split="validation"  # Use validation split for evaluation
            )
        except Exception as e:
            print(f"โŒ Failed to import {dataset_config['hf_name']}: {str(e)}")
            print("   This might be due to dataset structure changes or access restrictions.")

# Uncomment to import common datasets
# import_common_datasets(converter, subset_size=100)

Advanced Usage: Custom Dataset Processing

For more complex datasets, you might need custom processing:

python
def import_custom_translation_dataset():
    """Example of importing a translation dataset with custom processing."""
    
    print("\n๐ŸŒ Advanced Example: Translation Dataset")
    print("=" * 50)
    
    try:
        # Load a translation dataset
        from datasets import load_dataset
        
        # Example with a simple translation dataset
        dataset = load_dataset("Helsinki-NLP/opus-100", "en-es", split="train")
        
        # Custom processing for translation pairs
        opik_items = []
        for i, item in enumerate(dataset.select(range(100))):  # Limit for demo
            # Extract source and target from translation object
            translation = item['translation']
            opik_item = {
                "input": translation['en'],  # English source
                "expected_output": translation['es'],  # Spanish target
                "metadata": {
                    "language_pair": "en-es",
                    "domain": "general",
                    "sample_id": i
                }
            }
            opik_items.append(opik_item)
        
        # Create the Opik dataset (named opik_dataset to avoid shadowing the HF dataset above)
        opik_dataset = opik_client.get_or_create_dataset(
            name="opus-translation-en-es",
            description="OPUS-100 translation dataset (English to Spanish)"
        )
        
        # Insert items in batches
        batch_size = 50
        for i in tqdm(range(0, len(opik_items), batch_size), desc="Uploading"):
            batch = opik_items[i:i + batch_size]
            opik_dataset.insert(batch)
            
        print(f"✅ Successfully imported {len(opik_items)} translation pairs")
        
    except Exception as e:
        print(f"Translation dataset example failed: {e}")
        print("This is expected as dataset structures can vary.")

# Uncomment to try the translation example
# import_custom_translation_dataset()

Store New Dataset in Opik

This section shows how to create a dataset programmatically, similar to the synthetic data generation example:

python
def create_sample_qa_dataset():
    """Create a sample Q&A dataset similar to the synthetic data optimizer example."""
    
    sample_items = [
        {
            "question": "What is the capital of France?",
            "answer": "Paris",
            "category": "geography",
            "difficulty": "easy"
        },
        {
            "question": "Who wrote Romeo and Juliet?",
            "answer": "William Shakespeare", 
            "category": "literature",
            "difficulty": "easy"
        },
        {
            "question": "What is the largest planet in our solar system?",
            "answer": "Jupiter",
            "category": "science",
            "difficulty": "medium"
        },
        {
            "question": "In what year did World War II end?",
            "answer": "1945",
            "category": "history", 
            "difficulty": "medium"
        },
        {
            "question": "What is the chemical symbol for gold?",
            "answer": "Au",
            "category": "science",
            "difficulty": "hard"
        }
    ]
    
    dataset_name = "sample-qa-dataset"
    
    dataset = opik_client.get_or_create_dataset(
        name=dataset_name,
        description="Sample Q&A dataset for demonstration"
    )
    
    dataset.insert(sample_items)
    print(f"Opik Dataset '{dataset.name}' created with ID: {dataset.id}")
    
    return dataset

# Create the sample dataset
sample_dataset = create_sample_qa_dataset()