notebooks/data-augmentation/anthropic/safety data-augmentation.ipynb
! pip install -q sentence-transformers
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

SBERT_MODEL = "all-MiniLM-L6-v2"

red_teaming = load_dataset("Anthropic/hh-rlhf", data_dir="red-team-attempts", split="train")
pro_social_data = load_dataset("allenai/prosocial-dialog", split="train")
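A quick sanity check on the raw records before matching; the fields relied on below (task_description, rots, safety_label) should be present:
# Peek at one record from each dataset to confirm the schema used in this notebook.
print(red_teaming[0]["task_description"])
print(pro_social_data[0]["rots"], "->", pro_social_data[0]["safety_label"])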
def match_rot_safetylabels():
    # Map each unique rule-of-thumb (ROT) to the safety label of the first
    # ProSocial-Dialog example it appears in.
    all_rots = [item["rots"] for item in pro_social_data]
    safety_annotations = [item["safety_label"] for item in pro_social_data]
    results = {}
    for rots, sfty in zip(all_rots, safety_annotations):
        for rot in rots:
            if rot not in results:
                results[rot] = sfty
    return results
rot_sfty = match_rot_safetylabels()
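A spot check of the mapping (a sketch; the exact ROT strings depend on the dataset contents):
sample_rot = next(iter(rot_sfty))
print(f"{len(rot_sfty)} unique ROTs; e.g. {sample_rot!r} -> {rot_sfty[sample_rot]}")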
def load_vectorizer(model=SBERT_MODEL):
    return SentenceTransformer(model)

def vectorize_text(model, texts):
    return model.encode(texts, show_progress_bar=True)
model = load_vectorizer()
task_descriptions = [item["task_description"] for item in red_teaming]
rot_vector = vectorize_text(model, list(rot_sfty.keys()))
task_desp_vector = vectorize_text(model, task_descriptions)
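Both matrices should share the embedding dimension (384 for all-MiniLM-L6-v2); a quick shape check:
print(rot_vector.shape, task_desp_vector.shape)  # (n_rots, 384), (n_task_descriptions, 384)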
import scipy.spatial as sp
from collections import defaultdict
from tqdm import tqdm
THRESHOLD = 0.72
B_SIZE = 500
def get_conversations(rot_vector, task_desp_vector):
    # For each ROT embedding, collect all red-team task descriptions whose
    # cosine similarity clears THRESHOLD. Batched over ROTs to keep the
    # pairwise distance matrix small.
    matching_dict = defaultdict(list)
    for i in tqdm(range(0, rot_vector.shape[0], B_SIZE)):
        cosine_sim = 1 - sp.distance.cdist(rot_vector[i : i + B_SIZE, :], task_desp_vector, "cosine")
        sim_indices = np.argwhere(cosine_sim >= THRESHOLD)
        for rot_index, red_index in sim_indices:
            matching_dict[rot_index + i].append(red_index)
    return matching_dict
matching_dict = get_conversations(rot_vector, task_desp_vector)
print("Number of matching samples:", sum(len(x) for x in matching_dict.values()))
The episode_done flag indicates the end of a conversation.
def convert_red(red_teaming):
    # Split each red-team transcript into (Human, Assistant) turn pairs.
    all_conversations = []
    for item in red_teaming:
        transcript = [
            dialog for dialog in item["transcript"].split("\n\n") if dialog.startswith(("Human", "Assistant"))
        ]
        for i in range(0, len(transcript), 2):
            conversation = transcript[i : i + 2]
            conversation = {
                dialog.split(":")[0].strip(): ":".join(dialog.split(":")[1:]).strip() for dialog in conversation
            }
            # Only keep well-formed Human/Assistant pairs.
            if set(conversation.keys()) == {"Human", "Assistant"}:
                conversation["episode_done"] = i == len(transcript) - 2
                conversation["task_description"] = item["task_description"]
                conversation["min_harmlessness_score_transcript"] = item["min_harmlessness_score_transcript"]
                # Note: "task_descripton" is misspelled in the source dataset itself.
                conversation["task_descripton_harmlessness_score"] = item["task_descripton_harmlessness_score"]
                all_conversations.append(conversation)
    return all_conversations
print(
    f"Match found for ROT: {list(rot_sfty.keys())[1]}\nConversations:\n{convert_red([red_teaming[int(idx)] for idx in matching_dict[1]])}"
)
red_teaming_records = convert_red(red_teaming)
pd.DataFrame.from_records(red_teaming_records[:100]).head(5)
Now we have every red-teaming conversation paired with a semantically similar ROT. Next, we pseudo-label each matched conversation with a safety label, using a classifier fine-tuned on ProSocial-Dialog.
MAXLEN = 128
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
class ProSocialDataset(Dataset):
    def __init__(self, dataset, rot):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        self.sep_token = self.tokenizer.sep_token
        self.dataset = dataset
        self.rot = rot

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Walk backwards to the start of the current episode so the context
        # only contains turns from the same conversation.
        idx_start = idx
        end = self.dataset[max(0, idx_start - 1)]["episode_done"]
        while (not end) and (idx_start > 0):
            end = self.dataset[max(0, idx_start - 2)]["episode_done"]
            idx_start -= 1
        idx_start = max(0, idx_start)
        # Current utterance, prior human turns, and the ROT, joined with [SEP].
        prev_context = [f'{self.dataset[i]["Human"]}' for i in range(idx_start, idx)]
        rots = [self.rot]
        context = (
            f'{self.dataset[idx]["Human"]}' + self.sep_token + "".join(prev_context) + self.sep_token + "".join(rots)
        )
        encoding = self.tokenizer(
            context,
            max_length=MAXLEN,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return encoding
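A smoke test of the dataset on a single converted conversation (any matched ROT works; each field should come back as a (1, MAXLEN) tensor):
demo = ProSocialDataset(convert_red([red_teaming[0]]), list(rot_sfty.keys())[0])
print({k: v.shape for k, v in demo[0].items()})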
from scipy.special import softmax

np.set_printoptions(precision=3)

# Pseudo-labeling classifier fine-tuned on ProSocial-Dialog. Loaded under a
# distinct name so it does not shadow the SBERT model above.
classifier = AutoModelForSequenceClassification.from_pretrained("shahules786/prosocial-classifier")
def pseudo_label(red_teaming, matching_dict, rot_list):
    dfs = []
    # Iterate over the actual matched ROT indices; matching_dict is a
    # defaultdict, so indexing it with range() would silently create empty
    # entries and skip ROT indices beyond len(matching_dict).
    for i in tqdm(list(matching_dict.keys())):
        red_teaming_data = convert_red([red_teaming[int(idx)] for idx in matching_dict[i]])
        rot = rot_list[i]
        if len(red_teaming_data) > 1:
            pred_dataloader = prepare_dataloader(red_teaming_data, rot)
            probs, labels = predict(pred_dataloader)
            df_ = prepare_dataframe(red_teaming_data, probs, labels, rot)
            dfs.append(df_)
    return pd.concat(dfs)
def predict(pred_dataloader):
    probs, labels = [], []
    for data in pred_dataloader:
        # Squeeze out the per-example batch dim added by return_tensors="pt".
        pred = softmax(
            classifier(data["input_ids"].squeeze(1), data["attention_mask"].squeeze(1))
            .logits.detach()
            .numpy()
            .astype("float16"),
            axis=1,
        )
        probs.append(pred.max(axis=1))
        labels.append([classifier.config.id2label.get(pred_id) for pred_id in pred.argmax(axis=1)])
    return np.concatenate(probs), np.concatenate(labels)
def prepare_dataframe(red_teaming_data, probs, labels, rot):
    red_teaming_df = pd.DataFrame.from_records(red_teaming_data)
    red_teaming_df["safety_label"] = labels
    red_teaming_df["confidence"] = probs
    red_teaming_df["rots"] = [[rot]] * red_teaming_df.shape[0]
    return red_teaming_df

def prepare_dataloader(red_teaming_data, rot):
    pred_dataset = ProSocialDataset(red_teaming_data, rot)
    pred_dataloader = DataLoader(pred_dataset, batch_size=8)
    return pred_dataloader
output_df = pseudo_label(red_teaming, matching_dict, list(rot_sfty.keys()))
output_df.shape
output_df.head(5)
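Before saving, a quick look at how the pseudo-labels and confidences are distributed:
print(output_df["safety_label"].value_counts())
print(output_df["confidence"].describe())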
save_path = "/home/shahul/Data/red_teaming_augmented.json"
output_df.to_json(save_path, orient="records")
hf_dataset = load_dataset("json", data_files=save_path)
hf_dataset.push_to_hub("shahules786/prosocial_augmented")