notebooks/openassistant-oasst1/getting-started.ipynb
# uncomment and run the line below to install the dependencies if running in colab
#!pip install datasets pandas treelib
import pandas as pd
from datasets import load_dataset
from treelib import Tree
# widen the pandas display limits so printed frames are easier to read
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
def add_tree_level(df):
    """Return ``df`` with an integer ``tree_level`` column (depth in the message tree).

    A message with a missing ``parent_id`` is a root and gets level 0, a
    message whose ``parent_id`` equals its ``message_tree_id`` is a direct
    reply to the root and gets level 1, and every other message gets its
    parent's level plus one.  Levels are resolved by walking up the parent
    chain, so the rows may appear in any order.

    Args:
        df: DataFrame with ``message_id``, ``parent_id`` and
            ``message_tree_id`` columns.

    Returns:
        ``df`` itself when it already has a ``tree_level`` column; otherwise
        a new DataFrame obtained by merging the computed levels onto ``df``
        by ``message_id``.

    Raises:
        KeyError: if a message references a parent that is neither its
            ``message_tree_id`` nor a ``message_id`` present in ``df``.
        ValueError: if the parent links form a cycle.
    """
    # if tree level already exists, return df unchanged
    if "tree_level" in df.columns:
        return df

    parent_of = dict(zip(df["message_id"], df["parent_id"]))
    tree_of = dict(zip(df["message_id"], df["message_tree_id"]))

    tree_level_map = {}
    for message_id in parent_of:
        # climb towards the root until we reach a message whose level is known,
        # remembering the messages we passed so they can be filled in afterwards
        pending = []
        visited = set()
        current = message_id
        while current not in tree_level_map:
            parent_id = parent_of[current]
            # pd.isna covers None as well as NaN / pd.NA missing markers
            if pd.isna(parent_id):
                tree_level_map[current] = 0
            elif parent_id == tree_of[current]:
                tree_level_map[current] = 1
            else:
                if parent_id not in parent_of:
                    raise KeyError(parent_id)
                if current in visited:
                    raise ValueError(f"cycle detected in parent links at message {current!r}")
                visited.add(current)
                pending.append(current)
                current = parent_id
        # walk back down: the last pending message is the closest to `current`
        base_level = tree_level_map[current]
        for offset, descendant in enumerate(reversed(pending), start=1):
            tree_level_map[descendant] = base_level + offset

    # create a df from the tree_level_map and merge it with the original df
    df_tree_level_map = (
        pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
        .reset_index()
        .rename(columns={"index": "message_id"})
    )
    return df.merge(df_tree_level_map, on="message_id")
# fetch the oasst1 dataset through huggingface datasets and print its summary
ds = load_dataset("OpenAssistant/oasst1")
print(ds)
# turn the train split into a pandas df
df = ds["train"].to_pandas()
# summarise the df: columns, non-null counts and memory usage
df.info(show_counts=True, memory_usage=True, verbose=True)
# one random row as a {row index: {column: value}} dict, which is easy to read
df.sample(1).T.to_dict()
# lets grab a random message tree
message_tree_id = df["message_tree_id"].sample(1).values[0]
print(message_tree_id)
# look at all data for this message tree, oldest message first; a boolean mask
# compares the id directly instead of interpolating it into a query expression
# string, so it cannot break on ids that contain quote characters
df_message_tree = df[df["message_tree_id"] == message_tree_id].sort_values("created_date")
# add tree level to df
df_message_tree = add_tree_level(df_message_tree)
df_message_tree
# lets create a tree of message ids
id_tree = Tree()
# lets create a tree of message texts
text_tree = Tree()
# lets set a max char length for the text
max_char_len = 100
# iterate over rows in df_message_tree; a node's parent has to be in the tree
# before the node itself is added, so rows are expected parent-first
for i, row in df_message_tree.iterrows():
    # grab the message_id, parent_id and text
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    # treat any missing parent marker (None / NaN) as "no parent", i.e. a root
    if pd.isna(parent_id):
        parent_id = None
    text = row["text"]
    # slicing already copes with short strings; flatten newlines so each node
    # prints on a single line
    text_short = text[:max_char_len].replace("\n", " ")
    # create a node in the id_tree, add row as data in case want it later
    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())
    # the text is only the displayed tag; the node is keyed by message_id so
    # that two messages sharing the same (truncated) text neither collide nor
    # attach a reply to the wrong parent
    text_tree.create_node(text_short, message_id, parent=parent_id)
print("id_tree:")
id_tree.show()
print("text_tree:")
text_tree.show()