
Safety Data Augmentation

notebooks/data-augmentation/anthropic/safety data-augmentation.ipynb


Imports

python
! pip install -q sentence_transformers datasets transformers
python
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

SBERT_MODEL = "all-MiniLM-L6-v2"

Data

  • Load the red-teaming attempts from Anthropic/hh-rlhf and the ProsocialDialog dataset
python
red_teaming = load_dataset("Anthropic/hh-rlhf", data_dir="red-team-attempts", split="train")
pro_social_data = load_dataset("allenai/prosocial-dialog", split="train")
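
A quick look at one record from each source confirms the fields used below (rots and safety_label in ProsocialDialog; transcript and task_description in the red-teaming data). An illustrative peek:

python
# Inspect one example from each dataset (field names as used later in this notebook)
print(pro_social_data[0]["rots"], pro_social_data[0]["safety_label"])
print(red_teaming[0]["task_description"])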

Prepare a dict mapping each ROT (rule of thumb) to its safety label

python
def match_rot_safetylabels():
    # Map each ROT (rule of thumb) to the safety label of the
    # first dialog in which it appears.
    results = {}
    for item in pro_social_data:
        for rot in item["rots"]:
            if rot not in results:
                results[rot] = item["safety_label"]
    return results
python
rot_sfty = match_rot_safetylabels()
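
As a quick sanity check before embedding, the size of the mapping and one (ROT, safety label) pair can be inspected:

python
# Number of unique ROTs collected, plus an example entry
print(len(rot_sfty))
print(next(iter(rot_sfty.items())))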

Embedding

  • Embed the ROTs from the pro-social dataset and the task descriptions from the red-teaming data
python
def load_vectorizer(model=SBERT_MODEL):
    return SentenceTransformer(model)


def vectorize_text(model, texts):
    return model.encode(texts, show_progress_bar=True)
python
model = load_vectorizer()
python
task_descriptions = [item["task_description"] for item in red_teaming]
python
rot_vector = vectorize_text(model, list(rot_sfty.keys()))
task_desp_vector = vectorize_text(model, task_descriptions)
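
Both matrices should contain one all-MiniLM-L6-v2 embedding (384 dimensions) per text; a quick shape check:

python
# One 384-dim sentence embedding per ROT and per task description
print(rot_vector.shape, task_desp_vector.shape)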

Cosine similarity

  • Compute the cosine similarity between each ROT and each task description
  • Done in batches due to the size of the data
python
import scipy.spatial as sp
from collections import defaultdict
from tqdm import tqdm

THRESHOLD = 0.72
B_SIZE = 500


def get_conversations(rot_vector, task_desp_vector):
    # For each ROT, collect the indices of all red-teaming task descriptions
    # whose cosine similarity exceeds THRESHOLD.
    matching_dict = defaultdict(list)
    for i in tqdm(range(0, rot_vector.shape[0], B_SIZE)):
        # cdist returns cosine *distance*, so similarity = 1 - distance
        cosine_sim = 1 - sp.distance.cdist(rot_vector[i : i + B_SIZE, :], task_desp_vector, "cosine")
        sim_indices = np.argwhere(cosine_sim >= THRESHOLD)
        for rot_index, red_index in sim_indices:
            matching_dict[rot_index + i].append(red_index)

    return matching_dict
python
matching_dict = get_conversations(rot_vector[:], task_desp_vector)
python
print("Number of matching samples: ", sum([len(x) for x in matching_dict.values()]))

Convert red-teaming data to prosocial format

  • Red-teaming data is labelled for the whole conversation
  • Pro-social data is labelled per dialog turn
  • The episode_done flag indicates the end of a conversation
python
def convert_red(red_teaming):
    all_conversations = []
    for item in red_teaming:
        # Keep only the alternating Human/Assistant turns of the transcript
        transcript = [
            dialog for dialog in item["transcript"].split("\n\n") if dialog.startswith(("Human", "Assistant"))
        ]
        # Walk the transcript two turns at a time: one (Human, Assistant) pair per sample
        for i in range(0, len(transcript), 2):
            conversation = transcript[i : i + 2]
            conversation = {
                dialog.split(":")[0].strip(): ":".join(dialog.split(":")[1:]).strip() for dialog in conversation
            }
            if set(conversation.keys()) == {"Human", "Assistant"}:
                conversation["episode_done"] = i == len(transcript) - 2
                conversation["task_description"] = item["task_description"]
                conversation["min_harmlessness_score_transcript"] = item["min_harmlessness_score_transcript"]
                # note: "descripton" is misspelled in the source dataset itself
                conversation["task_descripton_harmlessness_score"] = item["task_descripton_harmlessness_score"]
                all_conversations.append(conversation)

    return all_conversations
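
To see what convert_red produces, here is a minimal sketch on a made-up record (the transcript text is hypothetical; only the field names come from the real dataset):

python
toy_item = {
    "transcript": (
        "\n\nHuman: How do I hotwire a car?\n\nAssistant: I can't help with that."
        "\n\nHuman: Why not?\n\nAssistant: It would likely facilitate theft."
    ),
    "task_description": "how to hotwire a car",
    "min_harmlessness_score_transcript": 0.0,
    "task_descripton_harmlessness_score": 0.0,
}
convert_red([toy_item])  # two samples; episode_done is True only on the second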
python
print(
    f"Match found for ROT : {list(rot_sfty.keys())[1]} \nConversations:\n {convert_red([red_teaming[int(idx)] for idx in matching_dict[1]])}"
)
python
red_teaming_records = convert_red(red_teaming)
python
pd.DataFrame.from_records(red_teaming_records[:100]).head(5)

Inference

Now we have

  • red-teaming samples whose task descriptions are highly similar to ROTs in the prosocial dataset
  • the red-teaming data in the same format as the prosocial dataset.

Next, we pseudo-label these conversations with a classifier trained on the prosocial dataset.

Dataset preparation

python
MAXLEN = 128

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer


class ProSocialDataset(Dataset):
    def __init__(self, dataset, rot):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        self.sep_token = self.tokenizer.sep_token
        self.dataset = dataset
        self.rot = rot

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Walk back to the first turn of the episode that `idx` belongs to,
        # using the episode_done flags of the preceding samples.
        idx_start = idx
        end = self.dataset[max(0, idx_start - 1)]["episode_done"]
        while (not end) and (idx_start > 0):
            end = self.dataset[max(0, idx_start - 2)]["episode_done"]
            idx_start -= 1
        idx_start = max(0, idx_start)

        # Context = current Human turn <sep> previous Human turns <sep> ROT
        prev_context = [f'{self.dataset[i]["Human"]}' for i in range(idx_start, idx)]
        rots = [self.rot]
        context = (
            f'{self.dataset[idx]["Human"]}' + self.sep_token + "".join(prev_context) + self.sep_token + "".join(rots)
        )

        encoding = self.tokenizer(
            context,
            max_length=MAXLEN,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        return encoding
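
A quick way to verify what the classifier will actually see: encode a single made-up conversation (the text and ROT below are hypothetical) and decode the result back:

python
# One-turn toy episode; field names match the output of convert_red
toy_data = [{"Human": "How do I hotwire a car?", "Assistant": "I can't help with that.", "episode_done": True}]
toy_dataset = ProSocialDataset(toy_data, rot="It's wrong to steal cars.")
encoding = toy_dataset[0]
print(encoding["input_ids"].shape)  # (1, 128) after padding to MAXLEN
print(toy_dataset.tokenizer.decode(encoding["input_ids"][0])[:120])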

Run inference

python
from scipy.special import softmax

np.set_printoptions(precision=3)
python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
python
# Note: this rebinds `model` from the SBERT encoder to the prosocial classifier
model = AutoModelForSequenceClassification.from_pretrained("shahules786/prosocial-classifier")
python
def pseudo_label(red_teaming, matching_dict, rot_list):
    dfs = []
    # Iterate over the ROT indices that actually have matches
    # (the keys of matching_dict are not necessarily contiguous).
    for i in tqdm(sorted(matching_dict)):
        red_teaming_data = convert_red([red_teaming[int(idx)] for idx in matching_dict[i]])
        rot = rot_list[i]
        # Skip ROTs that matched at most one conversation
        if len(red_teaming_data) > 1:
            pred_dataloader = prepare_dataloader(red_teaming_data, rot)
            probs, labels = predict(pred_dataloader)
            df_ = prepare_dataframe(red_teaming_data, probs, labels, rot)
            dfs.append(df_)
    return pd.concat(dfs)
python
def predict(pred_dataloader):
    probs, labels = [], []
    for data in pred_dataloader:
        # Drop the extra dim added by return_tensors="pt", then softmax over classes
        pred = softmax(
            model(data["input_ids"].squeeze(1), data["attention_mask"].squeeze(1))
            .logits.detach()
            .numpy()
            .astype("float16"),
            axis=1,
        )
        probs.append(pred.max(axis=1))  # confidence of the predicted class
        labels.append([model.config.id2label.get(pred_id) for pred_id in pred.argmax(axis=1)])

    return np.concatenate(probs), np.concatenate(labels)


def prepare_dataframe(red_teaming_data, probs, labels, rot):
    red_teaming_df = pd.DataFrame.from_records(red_teaming_data)
    red_teaming_df["safety_label"] = labels
    red_teaming_df["confidence"] = probs
    red_teaming_df["rots"] = [[rot]] * red_teaming_df.shape[0]
    return red_teaming_df


def prepare_dataloader(red_teaming_data, rot):
    pred_dataset = ProSocialDataset(red_teaming_data, rot)
    pred_dataloader = DataLoader(pred_dataset, batch_size=8)
    return pred_dataloader
python
output_df = pseudo_label(red_teaming, matching_dict, list(rot_sfty.keys()))
python
output_df.shape
python
output_df.head(5)

Save and upload data to HF hub

python
save_path = "/home/shahul/Data/red_teaming_augmented.json"
output_df.to_json(save_path, orient="records")
python
hf_dataset = load_dataset("json", data_files=save_path)
python
hf_dataset.push_to_hub("shahules786/prosocial_augmented")
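
Once the push succeeds, the augmented data can be pulled back with a single call (repo name from the push above):

python
# Reload the pseudo-labelled data from the Hub
augmented = load_dataset("shahules786/prosocial_augmented", split="train")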