examples/mmlu.ipynb
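This notebook shows how to build an MMLU eval from the raw dataset, run it with `oaieval`, and load the resulting event log into a pandas DataFrame for inspection.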
We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it generates a completion with the chosen model for each prompt, checks whether the completion matches the true answer, and then logs the result.
# Install the package located in the parent directory (editable mode), and
# download MMLU if you haven't already.
%pip install -e ../.
# Fetch the dataset archive into the current working directory.
!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar
# Unpack the archive in place.
!tar -xf data.tar
# Directory the cells below read the "dev" and "test" CSV files from
# (presumably created by unpacking the archive above).
data_path = "data"
import pandas as pd
import os
# Assuming this notebook is in examples/, the registry lives one level up.
# The path is normalized so the ".." segment is collapsed: this value is
# embedded in the file paths generated further down, and a clean absolute
# path is easier to read and compare.
registry_path = os.path.normpath(os.path.join(os.getcwd(), "..", "evals", "registry"))
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models
choices = ["A", "B", "C", "D"]
sys_msg = "The following are multiple choice questions (with answers) about {}."


def _format_question(question, answers):
    """Render a question followed by its lettered answer options.

    Each answer is paired with the corresponding letter from ``choices``
    (extra answers or letters are dropped by ``zip``), and the text ends with
    an ``Answer:`` cue.
    """
    options = "\n".join(f"{choice}. {answer}" for choice, answer in zip(choices, answers))
    return f"{question}\n{options}\nAnswer:"


def create_chat_prompt(sys_msg, question, answers, subject):
    """Build the chat messages for one multiple-choice question.

    Args:
        sys_msg: System-message template with one ``{}`` slot for the subject.
        question: The question text.
        answers: Iterable of answer options, in the same order as ``choices``.
        subject: Value substituted into ``sys_msg``.

    Returns:
        A two-element list: the formatted system message and the user message
        holding the question, its options and the ``Answer:`` cue.
    """
    return [
        {"role": "system", "content": sys_msg.format(subject)},
        {"role": "user", "content": _format_question(question, answers)},
    ]


def create_chat_example(question, answers, correct_answer):
    """
    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting

    Args:
        question: The question text.
        answers: Iterable of answer options, in the same order as ``choices``.
        correct_answer: The content of the example assistant message.

    Returns:
        A two-element list of ``system`` messages, named ``example_user`` and
        ``example_assistant`` respectively.
    """
    return [
        {"role": "system", "content": _format_question(question, answers), "name": "example_user"},
        {"role": "system", "content": correct_answer, "name": "example_assistant"},
    ]
import yaml
# Discover the subjects from the test split: every "<subject>_test.csv" file
# names one subject. A suffix check (rather than a substring match) keeps
# stray files such as "foo_test.csv.bak" out of the list.
test_suffix = "_test.csv"
subjects = sorted(
    f[: -len(test_suffix)]
    for f in os.listdir(os.path.join(data_path, "test"))
    if f.endswith(test_suffix)
)

# The CSV files are read without a header row, so the column names are supplied here.
csv_columns = ["Question", "A", "B", "C", "D", "Answer"]
answer_columns = ["A", "B", "C", "D"]
# Read every cell verbatim as text. Without this, pandas infers numeric types
# (so an answer such as "0.10" would be rendered as 0.1) and converts strings
# like "NA", "N/A" or "null" to NaN, which would corrupt the prompt text.
read_csv_kwargs = {"names": csv_columns, "dtype": str, "keep_default_na": False}

registry_yaml = {}
for subject in subjects:
    subject_path = os.path.join(registry_path, "data", "mmlu", subject)
    os.makedirs(subject_path, exist_ok=True)

    # Create few-shot prompts
    dev_df = pd.read_csv(os.path.join(data_path, "dev", subject + "_dev.csv"), **read_csv_kwargs)
    dev_df["sample"] = dev_df.apply(
        lambda x: create_chat_example(x["Question"], x[answer_columns], x["Answer"]), axis=1
    )
    few_shot_path = os.path.join(subject_path, "few_shot.jsonl")
    dev_df[["sample"]].to_json(few_shot_path, lines=True, orient="records")

    # Create test prompts and ideal completions
    test_df = pd.read_csv(os.path.join(data_path, "test", subject + "_test.csv"), **read_csv_kwargs)
    test_df["input"] = test_df.apply(
        lambda x: create_chat_prompt(sys_msg, x["Question"], x[answer_columns], subject), axis=1
    )
    test_df["ideal"] = test_df.Answer
    samples_path = os.path.join(subject_path, "samples.jsonl")
    test_df[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")

    # Two registry entries per subject: the eval id maps to a versioned entry,
    # and the versioned entry names the eval class and its arguments.
    eval_id = f"match_mmlu_{subject}"
    registry_yaml[eval_id] = {
        "id": f"{eval_id}.test.v1",
        "metrics": ["accuracy"],
    }
    registry_yaml[f"{eval_id}.test.v1"] = {
        "class": "evals.elsuite.basic.match:Match",
        "args": {
            "samples_jsonl": samples_path,
            "few_shot_jsonl": few_shot_path,
            "num_few_shot": 4,
        },
    }

# Write all entries to a single registry file; the encoding is explicit so the
# output does not depend on the platform default.
with open(os.path.join(registry_path, "evals", "mmlu.yaml"), "w", encoding="utf-8") as f:
    yaml.dump(registry_yaml, f)
# Run the anatomy eval registered above against gpt-3.5-turbo.
# This generates a JSONL file that records samples and logs, stored in /tmp/evallogs.
!oaieval gpt-3.5-turbo match_mmlu_anatomy
# How to process the log events generated by oaieval.
# NOTE: "{log_name}" is a literal placeholder (this is not an f-string);
# replace it with the name of the JSONL log in /tmp/evallogs before running.
events = "/tmp/evallogs/{log_name}"
with open(events, "r") as log_file:
    events_df = pd.read_json(log_file, lines=True)

# Keep only the "match" events and expand their "data" payload into columns.
is_match = events_df.type == "match"
matches_df = events_df[is_match].reset_index(drop=True)
match_details = pd.json_normalize(matches_df.data)
matches_df = matches_df.join(match_details)

# Bar chart of how often each value of "correct" occurs.
correct_counts = matches_df.correct.value_counts()
correct_counts.plot.bar(
    title="Correctness of generated answers",
    xlabel="Correctness",
    ylabel="Count",
)
# Inspect samples: print the prompt and the sampled completion of every
# "sampling" event, separated by a dashed rule.
sampling_events = events_df[events_df.type == "sampling"]
sampling_df = pd.json_normalize(sampling_events.data)
for _, row in sampling_df.iterrows():
    print(f"Prompt: {row.prompt}")
    print(f"Sampled: {row.sampled}")
    print("-" * 25)