examples/lafand-mt.ipynb
This notebook shows how to download the MAFAND dataset, build few-shot and test prompts in Chat format, register the resulting translation evals, run one with `oaieval`, and analyze the logged results.
%load_ext autoreload
%autoreload 2
import os
import requests
import pandas as pd
# Install Evals if you haven't already
# %pip install -e ../.
# Download the MAFAND dataset (dev/test TSVs for each language pair).
lang_pairs = [
    "en-amh", "en-hau", "en-ibo", "en-kin", "en-lug", "en-nya", "en-pcm", "en-sna", "en-swa", "en-tsn",
    "en-twi", "en-xho", "en-yor", "en-zul", "fr-bam", "fr-bbj", "fr-ewe", "fr-fon", "fr-mos", "fr-wol",
]

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "..", "evals", "registry")
data_path = os.path.join(registry_path, "data", "lafand-mt")
os.makedirs(data_path, exist_ok=True)

for pair in lang_pairs:
    os.makedirs(os.path.join(data_path, pair), exist_ok=True)
    for dev_test in ["dev", "test"]:
        raw_tsv_file = f'https://raw.githubusercontent.com/masakhane-io/lafand-mt/main/data/tsv_files/{pair}/{dev_test}.tsv'
        # Fail fast on HTTP errors (e.g. 404) instead of silently writing an
        # error page to disk; bound the request so a hung server can't stall the cell.
        response = requests.get(raw_tsv_file, timeout=60)
        response.raise_for_status()
        with open(os.path.join(data_path, pair, f"{dev_test}.tsv"), "w", encoding="utf-8") as f:
            f.write(response.text)
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models
sys_msg = "Translate the text from {} to {}."


def create_chat_prompt(sys_msg, input_lang, output_lang, input_text):
    """Build a two-message chat prompt: a language-pair system instruction followed by the user's text."""
    system_message = {"role": "system", "content": sys_msg.format(input_lang, output_lang)}
    user_message = {"role": "user", "content": input_text}
    return [system_message, user_message]
def create_chat_example(input_text, correct_translation):
    """
    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting

    Each example is a pair of system messages tagged with `name` so the model
    can tell the example user turn apart from the example assistant turn.
    """
    example_user = {"role": "system", "content": input_text, "name": "example_user"}
    example_assistant = {"role": "system", "content": correct_translation, "name": "example_assistant"}
    return [example_user, example_assistant]
import yaml
import os

# Build one eval per language pair: few-shot JSONL from the dev split,
# samples JSONL from the test split, plus a registry YAML entry.
translation_paths = sorted(os.path.join(data_path, d) for d in os.listdir(data_path))

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "..", "evals", "registry")
output_path = os.path.join(registry_path, "data", "lafand-mt")

registry_yaml = {}

for input_path in translation_paths:
    # os.path.basename is portable; splitting on "/" breaks on Windows paths.
    langs = os.path.basename(input_path)
    input_lang, output_lang = langs.split('-')

    pair_path = os.path.join(output_path, f"{input_lang}-{output_lang}")
    os.makedirs(pair_path, exist_ok=True)

    # Create few-shot prompts from the dev split.
    dev_df = pd.read_csv(os.path.join(input_path, "dev.tsv"), sep="\t")
    dev_df["sample"] = dev_df.apply(lambda x: create_chat_example(x[input_lang], x[output_lang]), axis=1)
    few_shot_path = os.path.join(pair_path, f"{input_lang}-{output_lang}_few_shot.jsonl")
    dev_df[["sample"]].to_json(few_shot_path, lines=True, orient="records")

    # Create test prompts and ideal completions from the test split.
    test_df = pd.read_csv(os.path.join(input_path, "test.tsv"), sep="\t")
    test_df["input"] = test_df[input_lang].apply(lambda x: create_chat_prompt(sys_msg, input_lang, output_lang, x))
    test_df["ideal"] = test_df[output_lang]
    samples_path = os.path.join(pair_path, f"{input_lang}-{output_lang}_samples.jsonl")
    test_df[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")

    # Registry: an alias entry pointing at the versioned eval definition.
    eval_id = f"mafand_translation_{input_lang}-{output_lang}"
    registry_yaml[eval_id] = {
        "id": f"{eval_id}.test.v1",
        "metrics": ["accuracy"],
    }
    registry_yaml[f"{eval_id}.test.v1"] = {
        "class": "evals.elsuite.translate:Translate",
        "args": {
            "samples_jsonl": samples_path,
            "few_shot_jsonl": few_shot_path,
            "num_few_shot": 4,
        },
    }

os.makedirs(os.path.join(registry_path, "evals"), exist_ok=True)
with open(os.path.join(registry_path, "evals", "mafand.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
!oaieval gpt-3.5-turbo mafand_translation_en-ibo --max_samples 20
# How to process the log events generated by oaieval
log_name = "EDIT THIS"  # copy from above
events = f"/tmp/evallogs/{log_name}"

with open(events, "r") as f:
    events_df = pd.read_json(f, lines=True)

# Keep only the "match" events and flatten their nested `data` payload into columns.
matches_df = events_df.loc[events_df.type == "match"].reset_index(drop=True)
matches_df = matches_df.join(pd.json_normalize(matches_df["data"]))

matches_df["correct"].value_counts().plot.bar(
    title="Correctness of generated answers",
    xlabel="sacrebleu score >30",
    ylabel="Count",
)
import matplotlib.pyplot as plt

# Survival curve of sentence-level sacrebleu scores: for each threshold,
# how many samples score at or above it.
scores = matches_df['sacrebleu_sentence_score']

# Define the threshold scores as a range from the minimum to the maximum score, in increments of 5.
thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)

# Count the number of scores at or above each threshold (matches the >= used below).
above_counts = [sum(score >= threshold for score in scores) for threshold in thresholds]

# Plot the counts as a step function.
# Fixed the garbled label text ("withabove") from the original.
plt.step(thresholds, above_counts, label='number of samples with score above threshold')
plt.legend()  # without this call the step label was never displayed

# Set the x and y labels; ">=" matches the counting rule above.
plt.xlabel('sacrebleu threshold')
plt.ylabel('number of samples w/ score >= threshold')

plt.show()
# Inspect samples
sampling_data = pd.json_normalize(events_df[events_df.type == "sampling"].data)
for _, row in sampling_data.iterrows():
    print(f"Prompt: {row.prompt}")
    print(f"Sampled: {row.sampled}")
    print("-" * 25)