Back to Evals

%pip install -e ../.

examples/lafand-mt.ipynb

3.0.15.4 KB
Original Source

Building a MAFAND Eval

This notebook shows how to:

  • Build and run an eval using the MAFAND dataset
  • Load the results into a Pandas DataFrame
python
%load_ext autoreload
%autoreload 2

import os
import requests
import pandas as pd


# Install Evals if you haven't already
# %pip install -e ../.
python
# Download the MAFAND dataset

lang_pairs = [
    "en-amh", "en-hau", "en-ibo", "en-kin", "en-lug", "en-nya", "en-pcm", "en-sna", "en-swa", "en-tsn",
    "en-twi", "en-xho", "en-yor", "en-zul", "fr-bam", "fr-bbj", "fr-ewe", "fr-fon", "fr-mos", "fr-wol"
]

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "..", "evals", "registry")
data_path = os.path.join(registry_path, "data", "lafand-mt")
os.makedirs(data_path, exist_ok=True)

for pair in lang_pairs:
    os.makedirs(os.path.join(data_path, pair), exist_ok=True)
    for dev_test in ['dev', 'test']:
        raw_tsv_file = f'https://raw.githubusercontent.com/masakhane-io/lafand-mt/main/data/tsv_files/{pair}/{dev_test}.tsv'
        # Fail loudly on HTTP errors (e.g. 404) instead of silently writing an
        # error page into the dataset file; bound the request with a timeout
        # so a stalled connection doesn't hang the notebook indefinitely.
        response = requests.get(raw_tsv_file, timeout=30)
        response.raise_for_status()
        with open(os.path.join(data_path, pair, f"{dev_test}.tsv"), "w", encoding="utf-8") as f:
            f.write(response.text)
python
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models

sys_msg = "Translate the text from {} to {}."
def create_chat_prompt(sys_msg, input_lang, output_lang, input_text):
    """Build a two-message chat prompt: a system instruction naming the
    language pair, followed by the user's text to translate."""
    system_message = {"role": "system", "content": sys_msg.format(input_lang, output_lang)}
    user_message = {"role": "user", "content": input_text}
    return [system_message, user_message]

def create_chat_example(input_text, correct_translation):
    """
    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
    """
    example_user = {"role": "system", "content": input_text, "name": "example_user"}
    example_assistant = {"role": "system", "content": correct_translation, "name": "example_assistant"}
    return [example_user, example_assistant]
python
import yaml
import os

# One subdirectory per language pair (e.g. "en-ibo"), created by the download cell.
translation_paths = sorted([os.path.join(data_path, d) for d in os.listdir(data_path)])

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "..", "evals", "registry")
output_path = os.path.join(registry_path, "data", "lafand-mt")

registry_yaml = {}

for input_path in translation_paths:
    # os.path.basename is portable; splitting on "/" would break on Windows,
    # where os.path.join uses "\" as the separator.
    langs = os.path.basename(input_path)
    input_lang, output_lang = langs.split('-')
    pair_path = os.path.join(output_path, f"{input_lang}-{output_lang}")
    os.makedirs(pair_path, exist_ok=True)

    # Create few-shot prompts from the dev split; each row becomes one
    # example_user/example_assistant message pair.
    dev_df = pd.read_csv(os.path.join(input_path, "dev.tsv"), sep="\t")
    dev_df["sample"] = dev_df.apply(lambda x: create_chat_example(x[input_lang], x[output_lang]), axis=1)
    few_shot_path = os.path.join(pair_path, f"{input_lang}-{output_lang}_few_shot.jsonl")
    dev_df[["sample"]].to_json(few_shot_path, lines=True, orient="records")

    # Create test prompts and ideal completions from the test split.
    test_df = pd.read_csv(os.path.join(input_path, "test.tsv"), sep="\t")
    test_df["input"] = test_df[input_lang].apply(lambda x: create_chat_prompt(sys_msg, input_lang, output_lang, x))
    test_df["ideal"] = test_df[output_lang]

    samples_path = os.path.join(pair_path, f"{input_lang}-{output_lang}_samples.jsonl")
    test_df[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")
    eval_id = f"mafand_translation_{input_lang}-{output_lang}"

    # Two registry entries per pair: the eval alias and its versioned spec.
    registry_yaml[eval_id] = {
        "id": f"{eval_id}.test.v1",
        "metrics": ["accuracy"]
    }
    registry_yaml[f"{eval_id}.test.v1"] = {
        "class": "evals.elsuite.translate:Translate",
        "args": {
            "samples_jsonl": samples_path,
            "few_shot_jsonl": few_shot_path,
            "num_few_shot": 4,
        }
    }

os.makedirs(os.path.join(registry_path, "evals"), exist_ok=True)
with open(os.path.join(registry_path, "evals", "mafand.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)
python
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
# oaieval CLI usage: oaieval <completion_fn/model> <eval_name>; --max_samples caps
# how many test samples are run (20 here, to keep the demo cheap and fast).
!oaieval gpt-3.5-turbo mafand_translation_en-ibo --max_samples 20
python
# How to process the log events generated by oaieval

log_name = "EDIT THIS"  # copy from above
events = f"/tmp/evallogs/{log_name}"

# The log is JSON-lines: one event object per line.
with open(events, "r") as log_file:
    events_df = pd.read_json(log_file, lines=True)

# Keep only the "match" events and flatten their nested `data` payload
# into top-level columns alongside the event metadata.
match_events = events_df[events_df.type == "match"]
matches_df = match_events.reset_index(drop=True)
matches_df = matches_df.join(pd.json_normalize(matches_df.data))
matches_df.correct.value_counts().plot.bar(title="Correctness of generated answers", xlabel="sacrebleu score >30", ylabel="Count")
python
import matplotlib.pyplot as plt

# your list of scores
scores = matches_df['sacrebleu_sentence_score']

# define the threshold scores as a range from the minimum to the maximum score, in increments of 5
thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)

# count the number of scores at or above each threshold
# (sum over a generator avoids building a throwaway list per threshold)
above_counts = [sum(score >= threshold for score in scores) for threshold in thresholds]

# plot the counts as a step function
# label fixed: original was the garbled 'number of samples withabove'
plt.step(thresholds, above_counts, label='number of samples with score >= threshold')

# set the x and y labels; ">=" matches the comparison used in the count above
plt.xlabel('sacrebleu threshold')
plt.ylabel('number of samples w/ score >= threshold')

# show the plot
plt.show()
python
# Inspect samples
# Flatten the `data` payload of every "sampling" event, then print each
# prompt with the model's sampled completion, separated by a rule.
sampling_events = pd.json_normalize(events_df[events_df.type == "sampling"].data)
for _, row in sampling_events.iterrows():
    print(f"Prompt: {row.prompt}")
    print(f"Sampled: {row.sampled}")
    print("-" * 25)