python

!pip install plotly kaleido datasets nbformat -U -q

python

import os

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login


# Load variables from a .env file into the process environment; with
# override=True, values from the file replace variables that are already set.
load_dotenv(override=True)
# Log in to the Hugging Face Hub with the token stored in HF_TOKEN
# (os.getenv returns None when the variable is not set).
login(os.getenv("HF_TOKEN"))

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option("max_colwidth", None)

# Root directory of the run outputs that are read further down.
OUTPUT_DIR = "output"

python

# Validation split of the GAIA benchmark, with its columns renamed to the
# names used by the result files ("question", "true_answer", "task").
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"].rename_columns(
    {
        "Question": "question",
        "Final answer": "true_answer",
        "Level": "task",
    }
)
# DataFrame version of the split, used for lookups by question text.
eval_df = pd.DataFrame(eval_ds)

1. Load all results

python

import glob


# Read every run file of the validation folder into one DataFrame; the part of
# the file name before the first "." identifies the agent.
results = []
for f in glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl"):
    df = pd.read_json(f, lines=True)
    # os.path.basename understands the platform's separator: on Windows glob
    # returns backslash-separated paths, and splitting on "/" alone would leave
    # the directory inside the agent name.
    df["agent_name"] = os.path.basename(f).split(".")[0]
    results.append(df)

if not results:
    # pd.concat([]) fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No .jsonl result file found in {OUTPUT_DIR}/validation")

result_df = pd.concat(results)
# Runs without an answer get an explicit placeholder instead of NaN.
result_df["prediction"] = result_df["prediction"].fillna("No prediction")

python

import re
from collections import Counter

from scripts.gaia_scorer import check_close_call, question_scorer


# Verdict of the scorer for every prediction against its reference answer.
result_df["is_correct"] = result_df.apply(
    lambda row: question_scorer(row["prediction"], row["true_answer"]),
    axis=1,
)
# Close-call check; it also receives the verdict computed just above.
result_df["is_near_correct"] = result_df.apply(
    lambda row: check_close_call(row["prediction"], row["true_answer"], row["is_correct"]),
    axis=1,
)

# Number of recorded intermediate steps for each run.
result_df["count_steps"] = result_df["intermediate_steps"].map(len)


def find_attachment(question):
    """Return the attachment extension of the benchmark question matching `question`.

    A row of the module-level `eval_df` matches when its "question" text is a
    substring of `question`; only the first matching row is used.

    Returns:
        The text after the last "." of that row's "file_name",
        "None" when the file name is empty or not a string,
        "Not found" when no row matches.
    """
    is_contained = eval_df["question"].apply(lambda candidate: candidate in question)
    file_names = eval_df.loc[is_contained, "file_name"]
    if file_names.empty:
        return "Not found"

    first_name = file_names.values[0]
    if not isinstance(first_name, str) or not first_name:
        return "None"
    return first_name.rsplit(".", 1)[-1]


# Attachment extension of each question, as resolved by find_attachment
# ("None" / "Not found" when no usable file name is available).
result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    """Count the lowercase call-like names found in `code`.

    Every word immediately followed by "(" is treated as a call. Names for
    which str.islower() is false (for example names containing an uppercase
    letter) are discarded.

    Returns:
        A Counter mapping each kept name to its number of occurrences.
    """
    called_names = re.findall(r"\b(\w+)\(", code)
    return Counter(name for name in called_names if name.islower())


def sum_tool_calls(steps):
    """Add up the call counts of every step that carries an "llm_output".

    Steps without an "llm_output" entry are skipped. Each remaining output is
    passed to extract_tool_calls and the resulting counts are accumulated.

    Returns:
        A Counter with the total number of occurrences per call name.
    """
    tally = Counter()
    llm_outputs = (step["llm_output"] for step in steps if "llm_output" in step)
    for output in llm_outputs:
        tally += extract_tool_calls(output)
    return tally


def get_durations(row):
    """Return the elapsed time of one run in whole seconds.

    `row` must provide "start_time" and "end_time" values whose difference
    exposes total_seconds() (e.g. datetimes or pandas Timestamps). The
    fractional part is dropped by int(), i.e. truncated toward zero.
    """
    elapsed_seconds = (row["end_time"] - row["start_time"]).total_seconds()
    return int(elapsed_seconds)


# Elapsed time of each run in whole seconds (end_time - start_time).
result_df["duration"] = result_df.apply(get_durations, axis=1)
# Per-run tool-call counting is currently switched off.
# result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

python

# Number of result rows loaded for each agent.
result_df["agent_name"].value_counts()

2. Inspect specific runs

python

# Start from every loaded run; the commented filter below restricts the
# selection to a list of agent names when enabled.
sel_df = result_df
# sel_df = sel_df.loc[
#     (result_df["agent_name"].isin(list_versions))
# ]
# Replace the per-file row labels by one continuous 0..n-1 index.
sel_df = sel_df.reset_index(drop=True)
# Rows per agent before de-duplication.
display(sel_df["agent_name"].value_counts())
# Keep a single row per (agent, question) pair.
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
# Rows per agent and task value after de-duplication.
display(sel_df.groupby("agent_name")[["task"]].value_counts())
# NOTE(review): 165 is compared with the row count summed over ALL selected
# agents; presumably it is the size of the GAIA validation split — confirm.
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)

python

# Mean of is_correct per agent, rounded to three decimals.
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))

# Per agent and task: mean correctness, mean close-call rate, mean number of
# steps, number of questions ("count") and mean duration.
display(
    sel_df.groupby(["agent_name", "task"]).agg(
        is_correct=("is_correct", "mean"),
        is_near_correct=("is_near_correct", "mean"),
        count_steps=("count_steps", "mean"),
        count=("question", "count"),
        duration=("duration", "mean"),
    )
)

python

import plotly.express as px


# Running statistics per agent, in row order: the expanding mean of is_correct
# is the accuracy so far, and the expanding count of is_near_correct is reused
# as a 1-based position of the row within its agent (renamed to "index").
cumulative_df = (
    sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
    # `axis=0` was only restating the default; the `axis` keyword of
    # expanding() is deprecated in recent pandas releases, so it is omitted.
    .expanding(min_periods=1, method="single")
    .agg({"is_correct": "mean", "is_near_correct": "count"})
    .reset_index()
    .rename(columns={"is_near_correct": "index"})
)
# Convert the 1-based running count into a 0-based integer position.
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    """Return the first 50 characters of the question a cumulative row refers to.

    The question is looked up in the module-level `sel_df`: among the rows of
    agent `row["agent_name"]`, the one at position `row["index"]` is taken.
    Any failure during the lookup yields an empty string.
    """
    try:
        agent_questions = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"]
        return agent_questions.iloc[row["index"]][:50]
    except Exception:
        return ""


# Short question preview for every point, shown in the hover tooltip.
cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)

# One line per agent: running mean of is_correct against the position of the
# question within that agent's rows.
px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)

3. Dive deeper into one run

python

# Rows of the "o1" agent only. An explicit copy is taken because new columns
# are assigned to this selection further down; writing into a slice of
# result_df would otherwise raise pandas' SettingWithCopyWarning.
sel_df = result_df.loc[result_df["agent_name"] == "o1"].copy()
print(len(sel_df))

Count errors

python

import numpy as np


# Error names that get their own counter column in sel_df.
error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
# Every counter starts at zero; "Count steps" stays NaN until it is filled
# in by count_errors.
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan


def count_errors(row):
    """Fill the step count and the per-type error counters of one run.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding
            "intermediate_steps", "Count steps" and one numeric counter per
            tracked error type.

    Returns:
        The same `row`, updated in place. When "intermediate_steps" is not a
        list the row is returned untouched. Otherwise "Count steps" is set to
        the number of steps and, for every dict step with an "error" entry,
        the counter named after the error's "error_type" is incremented.
    """
    steps = row["intermediate_steps"]
    if not isinstance(steps, list):
        return row

    row["Count steps"] = len(steps)
    for step in steps:
        if not (isinstance(step, dict) and "error" in step):
            continue
        try:
            row[str(step["error"]["error_type"])] += 1
        except (KeyError, TypeError):
            # Best effort: the error has no "error_type", is not a mapping, or
            # names something that is not a numeric counter of this row.
            continue
    return row


# Fill "Count steps" and the error counters of every run, one row at a time.
sel_df = sel_df.apply(count_errors, axis=1)

python

import plotly.express as px


# Mean of every error counter and of "Count steps", computed separately for
# each is_correct value, then melted to long format: one row per
# (is_correct, variable) pair with the mean in "value".
aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]].mean().reset_index().melt(id_vars=["is_correct"])
)

# Bar chart: one bar group per variable, coloured by is_correct.
fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    # NOTE(review): aggregate_errors only has the columns "is_correct",
    # "variable" and "value"; the other label keys look like leftovers.
    labels={
        "agent_name": "<b>Model</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
# Save a static image of the chart in the working directory, then show it.
fig.write_image("aggregate_errors.png", scale=3)
fig.show()

Inspect results by file extension type

python

# Per attachment type: mean correctness, mean number of steps and number of
# result rows (counted on the "question" column).
display(
    result_df.groupby(["attachment_type"]).agg(
        is_correct=("is_correct", "mean"),
        count_steps=("count_steps", "mean"),
        question=("question", "count"),
    )
)

4. Ensembling methods

python

# Only agents with more than 140 result rows take part in the ensembling.
counts = result_df["agent_name"].value_counts()
well_covered_agents = counts.loc[counts.gt(140)].index
long_series = result_df.loc[result_df["agent_name"].isin(well_covered_agents)]

python

def majority_vote(df):
    """Pick, for every question, the answer given most often across runs.

    Rows that carry no real answer are discarded before voting: missing
    values and the placeholders "Unable to determine", "None" and
    "No prediction" (the value missing predictions are filled with when the
    results are loaded). Without the last one, a question that most agents
    failed to answer would elect the placeholder itself.

    Args:
        df: DataFrame with "question", "prediction", "task" and "is_correct"
            columns, one row per (agent, question) run.

    Returns:
        DataFrame with one row per question that still has at least one real
        answer, holding "question", the winning "prediction", and the "task" /
        "is_correct" values of the first row that gave this prediction.
        Ties are resolved by taking the first mode in sorted order.
    """
    non_answers = ["Unable to determine", "None", "No prediction"]
    df = df[df["prediction"].notna() & ~df["prediction"].isin(non_answers)]

    answer_modes = df.groupby("question")["prediction"].agg(lambda x: x.mode().iloc[0]).reset_index()
    first_occurrences = (
        df.groupby(["question", "prediction"]).agg({"task": "first", "is_correct": "first"}).reset_index()
    )
    # Attach task and correctness of the winning answer to each question.
    result = answer_modes.merge(first_occurrences, on=["question", "prediction"], how="left")

    return result


def oracle(df):
    """Return, for every question, its first correct run, else its first run.

    This is the score an ideal selector would reach if it could always pick a
    correct run whenever at least one exists.

    The selection is done with concat + drop_duplicates instead of
    groupby().apply(): applying a function that reads the grouping column is
    deprecated in recent pandas releases, and the vectorised form also keeps
    the original column dtypes.

    Args:
        df: DataFrame with a "question" column and a boolean "is_correct"
            column, one row per (agent, question) run.

    Returns:
        DataFrame with the same columns as `df`, one row per question, sorted
        by question and with a fresh 0..n-1 index.
    """
    # groupby() ignores rows whose key is missing; keep that behaviour.
    df = df.dropna(subset=["question"])
    # Correct rows first, then every row: the first occurrence of a question
    # is its first correct row, or its first row when none is correct.
    candidates = pd.concat([df[df["is_correct"]], df])
    best = candidates.drop_duplicates(subset="question", keep="first")
    # Same ordering as a groupby on "question": sorted by question.
    return best.sort_values("question", kind="stable").reset_index(drop=True)


# Mean of is_correct per retained agent, as a percentage.
display((long_series.groupby("agent_name")["is_correct"].mean() * 100).round(2))
# Same metric for the majority-vote and oracle ensembles built from these runs.
print(f"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}")
print(f"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}")

Submit

python

# Run file to export. It is read from the same OUTPUT_DIR tree as the results
# loaded earlier, rather than from a second hard-coded "output" path.
agent_run = "code_o1_04_february_submission5.jsonl"
df = pd.read_json(f"{OUTPUT_DIR}/validation/{agent_run}", lines=True)
# Keep the task id, the answer and the step trace, and rename the last two to
# "model_answer" and "reasoning_trace".
df = df[["task_id", "prediction", "intermediate_steps"]]
df = df.rename(columns={"prediction": "model_answer", "intermediate_steps": "reasoning_trace"})

python

# Write the submission as JSON Lines: one record (row) per line.
df.to_json("submission.jsonl", orient="records", lines=True)