# examples/open_deep_research/analysis.ipynb
!pip install plotly kaleido datasets nbformat -U -q
import os
import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login
# Authenticate against the Hugging Face Hub using HF_TOKEN from the .env file
# (override=True lets the .env value win over a pre-set environment variable).
load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))
# Show full cell contents when displaying DataFrames (questions are long).
# NOTE(review): modern pandas spells this option "display.max_colwidth" —
# the short form relies on pandas' prefix matching; confirm it still resolves.
pd.set_option("max_colwidth", None)
OUTPUT_DIR = "output"  # directory holding the agents' .jsonl run outputs
# GAIA validation split; columns renamed to match the result files' schema.
# NOTE(review): "Level" is renamed to "task", so the "task" column holds the
# difficulty level, not a task id.
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
eval_df = pd.DataFrame(eval_ds)
import glob
# Load every agent's validation run file; one row per answered question.
# sorted() makes result ordering deterministic across platforms (glob order
# is filesystem-dependent); os.path.basename works on both / and \ paths.
results = []
for file_path in sorted(glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl")):
    run_df = pd.read_json(file_path, lines=True)
    # Agent name = file name up to its first dot (e.g. "code_o1.jsonl" -> "code_o1").
    run_df["agent_name"] = os.path.basename(file_path).split(".")[0]
    results.append(run_df)
if not results:
    # Fail with a clear message instead of pd.concat's cryptic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No run files matched {OUTPUT_DIR}/validation/*.jsonl")
result_df = pd.concat(results)
# Normalize missing predictions so the scorers always receive a string.
result_df["prediction"] = result_df["prediction"].fillna("No prediction")
import re
from collections import Counter
from scripts.gaia_scorer import check_close_call, question_scorer
# Exact-match scoring of each prediction against the GAIA ground truth.
result_df["is_correct"] = result_df.apply(lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1)
# "Near correct": close-call check (told whether the row was already correct).
result_df["is_near_correct"] = result_df.apply(
    lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
    axis=1,
)
# Number of agent steps taken per run.
result_df["count_steps"] = result_df["intermediate_steps"].apply(len)
def find_attachment(question):
    """Return the attachment file extension for the GAIA task matching *question*.

    Returns "Not found" when no task's question text is contained in
    *question*, "None" when the matched task has no attachment, and the
    text after the last dot of the file name otherwise.
    """
    is_match = eval_df["question"].apply(lambda text: text in question)
    candidates = eval_df.loc[is_match, "file_name"]
    if len(candidates) == 0:
        return "Not found"
    attachment = candidates.values[0]
    if not (isinstance(attachment, str) and len(attachment) > 0):
        return "None"
    return attachment.split(".")[-1]
# Tag each result row with its task's attachment extension ("Not found"/"None"/ext).
result_df["attachment_type"] = result_df["question"].apply(find_attachment)
def extract_tool_calls(code):
    """Count occurrences of each lowercase function name invoked in *code*.

    Matches any word immediately followed by "(" and keeps only the
    all-lowercase ones (heuristic: tool names are lowercase, class
    constructors are not).
    """
    called_names = re.findall(r"\b(\w+)\(", code)
    return Counter(name for name in called_names if name.islower())
def sum_tool_calls(steps):
    """Aggregate tool-call counts over every step that produced LLM output."""
    outputs = (step["llm_output"] for step in steps if "llm_output" in step)
    return sum((extract_tool_calls(text) for text in outputs), Counter())
def get_durations(row):
    """Wall-clock duration of a run, in whole seconds (truncated).

    Assumes ``row["start_time"]``/``row["end_time"]`` are already
    datetime-like (their difference supports ``total_seconds``) —
    TODO(review): confirm against the jsonl loader's parsing.
    """
    elapsed = row["end_time"] - row["start_time"]
    return int(elapsed.total_seconds())
# Per-run wall-clock duration in seconds.
result_df["duration"] = result_df.apply(get_durations, axis=1)
# result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)
result_df["agent_name"].value_counts()
# Working selection; uncomment the filter below to restrict to specific versions.
sel_df = result_df
# sel_df = sel_df.loc[
#     (result_df["agent_name"].isin(list_versions))
# ]
sel_df = sel_df.reset_index(drop=True)
display(sel_df["agent_name"].value_counts())
# Keep a single row per (agent, question) pair before scoring.
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
display(sel_df.groupby("agent_name")[["task"]].value_counts())
# 165 = size of the GAIA validation split.
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
# Per-agent x per-level summary: accuracy, near-accuracy, mean step count,
# number of questions answered, and mean duration (seconds).
display(
    sel_df.groupby(["agent_name", "task"])[["is_correct", "is_near_correct", "count_steps", "question", "duration"]]
    .agg(
        {
            "is_correct": "mean",
            "is_near_correct": "mean",
            "count_steps": "mean",
            "question": "count",
            "duration": "mean",
        }
    )
    .rename(columns={"question": "count"})
)
import plotly.express as px
# Running per-agent averages: for each agent, "is_correct" becomes the
# cumulative mean over its rows while "is_near_correct" becomes the cumulative
# row count (reused below as a 0-based x-axis index, hence the rename).
# NOTE(review): the axis= keyword of expanding() is deprecated in recent
# pandas — confirm the installed version still accepts it.
cumulative_df = (
    (
        sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
        .expanding(min_periods=1, axis=0, method="single")
        .agg({"is_correct": "mean", "is_near_correct": "count"})
        .reset_index()
    )
    .copy()
    .rename(columns={"is_near_correct": "index"})
)
# Shift the 1-based cumulative count to a 0-based positional index.
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1
def find_question(row):
    """First 50 characters of the question at position ``row["index"]`` among
    the rows of ``sel_df`` belonging to this row's agent ("" on any failure,
    e.g. an out-of-range index)."""
    try:
        agent_rows = sel_df["agent_name"] == row["agent_name"]
        questions = sel_df.loc[agent_rows, "question"]
        return questions.iloc[row["index"]][:50]
    except Exception:
        return ""
# Attach a truncated question label for hover tooltips.
cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)
# Cumulative accuracy curve per agent; hovering shows the question at each step.
px.line(
    cumulative_df,
    color="agent_name",
    x="index",
    y="is_correct",
    hover_data="question",
)
# Focus the error analysis on the "o1" agent's runs.
# .copy() is required: the column assignments below would otherwise write into
# a view of result_df, triggering SettingWithCopyWarning (and, under pandas'
# copy-on-write mode, silently failing to stick).
sel_df = result_df.loc[result_df["agent_name"] == "o1"].copy()
print(len(sel_df))
import numpy as np

# Error categories reported in the intermediate steps' "error" payloads;
# initialized to 0 so count_errors below can increment them per row.
error_types = [
    "AgentParsingError",
    "AgentExecutionError",
    "AgentMaxIterationsError",
    "AgentGenerationError",
]
sel_df[error_types] = 0
sel_df["Count steps"] = np.nan
def count_errors(row):
    """Fill *row*'s per-category error counters and its "Count steps" value.

    Mutates and returns *row* (designed for ``DataFrame.apply(..., axis=1)``).
    Error types without a matching pre-created counter are ignored silently.
    """
    steps = row["intermediate_steps"]
    if isinstance(steps, list):
        row["Count steps"] = len(steps)
        for step in steps:
            if not (isinstance(step, dict) and "error" in step):
                continue
            try:
                row[str(step["error"]["error_type"])] += 1
            except Exception:
                pass
    return row
# Accumulate error counts / step counts row by row.
sel_df = sel_df.apply(count_errors, axis=1)
import plotly.express as px

# Mean error counts and step count, split by whether the run was correct,
# melted to long form for plotting.
aggregate_errors = (
    sel_df.groupby(["is_correct"])[error_types + ["Count steps"]].mean().reset_index().melt(id_vars=["is_correct"])
)
# Grouped bar chart: one bar group per error category, colored by correctness.
fig = px.bar(
    aggregate_errors,
    y="value",
    x="variable",
    color="is_correct",
    labels={
        "agent_name": "<b>Model</b>",
        "task": "<b>Level</b>",
        "aggregate_score": "<b>Performance</b>",
        "value": "<b>Average count</b>",
        "eval_score_GPT4": "<b>Score</b>",
    },
)
fig.update_layout(
    height=500,
    width=800,
    barmode="group",
    bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
# scale=3 renders the PNG at 3x resolution (requires kaleido).
fig.write_image("aggregate_errors.png", scale=3)
fig.show()
# Accuracy / step count broken down by attachment type.
display(
    result_df.groupby(["attachment_type"])[["is_correct", "count_steps", "question"]].agg(
        {"is_correct": "mean", "count_steps": "mean", "question": "count"}
    )
)
# Keep only agents with near-complete coverage (more than 140 of 165 questions)
# so the ensemble scores below are comparable.
counts = result_df["agent_name"].value_counts()
long_series = result_df.loc[result_df["agent_name"].isin(counts[counts > 140].index)]
def majority_vote(df):
    """Ensemble by majority vote: the modal prediction per question.

    Rows with unusable predictions ("Unable to determine", NaN, "None") are
    dropped first. Each modal answer is merged with the task level and
    correctness recorded for its first occurrence; mode ties are broken by
    pandas' mode ordering.
    """
    usable = df[
        (df["prediction"] != "Unable to determine")
        & df["prediction"].notna()
        & (df["prediction"] != "None")
    ]
    modal = usable.groupby("question")["prediction"].agg(lambda preds: preds.mode()[0]).reset_index()
    first_seen = (
        usable.groupby(["question", "prediction"]).agg({"task": "first", "is_correct": "first"}).reset_index()
    )
    return modal.merge(first_seen, on=["question", "prediction"], how="left")
def oracle(df):
    """Ensemble upper bound: a question counts as solved if ANY row got it right.

    Per question, returns the first correct row when one exists, otherwise
    the first row of the group.
    """

    def pick(group):
        # One-line purpose: choose the group's first correct row, else its first row.
        hits = group[group["is_correct"]]
        return hits.iloc[0] if len(hits) > 0 else group.iloc[0]

    return df.groupby("question").apply(pick).reset_index(drop=True)
# Per-agent accuracy (%), plus the two ensemble scores over well-covered agents.
display((long_series.groupby("agent_name")["is_correct"].mean() * 100).round(2))
print(f"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}")
print(f"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}")
# Export one chosen run in the GAIA leaderboard submission format
# (task_id / model_answer / reasoning_trace, one JSON object per line).
agent_run = "code_o1_04_february_submission5.jsonl"
df = pd.read_json(f"output/validation/{agent_run}", lines=True)
df = df[["task_id", "prediction", "intermediate_steps"]]
df = df.rename(columns={"prediction": "model_answer", "intermediate_steps": "reasoning_trace"})
df.to_json("submission.jsonl", orient="records", lines=True)