Prosocial Confessions

data/datasets/prosocial_confessions/prosocial-confessions.ipynb


Introduction

This notebook builds the prosocial-confessions dataset: confession titles from the one-million-reddit-confessions Kaggle dump are matched against rules of thumb (ROTs) from the allenai/prosocial-dialog dataset via sentence embeddings, and the matched posts are then pseudo-labeled with a safety classifier before being pushed to the Hugging Face Hub.

Imports

python
! pip install -q kaggle
! pip install -q sentence_transformers
python
import json
from collections import ChainMap, defaultdict

import faiss
import kaggle
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from scipy.special import softmax
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

SBERT_MODEL = "all-MiniLM-L6-v2"

np.set_printoptions(precision=3)
python
THRESHOLD = 0.72  # minimum cosine similarity for a ROT to count as a match
B_SIZE = 1000  # batch size for the FAISS similarity search
MAXLEN = 196  # maximum token length for the classifier inputs
python
# download the data; you can get your kaggle.json file from your account page https://www.kaggle.com/me/account
# the CSV is unzipped into the current working directory
kaggle.api.dataset_download_files("pavellexyr/one-million-reddit-confessions", unzip=True)

Functions

python
def load_vectorizer(model=SBERT_MODEL):
    return SentenceTransformer(model)


def vectorize_text(model, texts):
    return model.encode(texts, show_progress_bar=True)


def build_index(vector):
    d = vector.shape[1]
    index = faiss.IndexFlatIP(d)
    faiss.normalize_L2(vector)
    index.add(vector)
    return index


def get_conversations(rot_vector, posts_vector):
    index = build_index(rot_vector)
    num_result = 5
    faiss.normalize_L2(posts_vector)
    matching_dict = defaultdict(list)
    for idx in tqdm(range(0, posts_vector.shape[0], B_SIZE)):
        # D holds similarity scores, I the indices of the matching ROTs
        D, I = index.search(posts_vector[idx : idx + B_SIZE], num_result)
        sim_indices = np.argwhere(D >= THRESHOLD)
        for k, j in sim_indices:
            matching_dict[k + idx].append({I[k][j]: D[k][j]})

    return matching_dict


def get_top_n(data, n=2):
    # merge the per-match {rot_id: similarity} dicts and keep the n ROTs with the highest similarity
    data = dict(ChainMap(*data))
    data = {k: v for k, v in sorted(data.items(), key=lambda item: item[1], reverse=True)}
    return list(data.keys())[:n]


def convert_to_json(match_dict, all_rots, reddit_df):
    all_data = []
    posts = reddit_df["title"].values
    permalinks = reddit_df["permalink"].values
    for post_id, rot_list in match_dict.items():
        rot_ids = get_top_n(rot_list)
        ps_rots = [all_rots[idx] for idx in rot_ids]
        all_data.append(
            {"context": posts[post_id], "rots": ps_rots, "source": permalinks[post_id], "episode_done": True}
        )

    return all_data


def load_confessions(path):
    conf_df = pd.read_csv(path)
    # keep posts with a score above 3 and titles longer than five words
    conf_df_filtered = conf_df.query("score>3.0").copy()
    conf_df_filtered["title_len"] = conf_df_filtered["title"].map(lambda x: len(x.split()))
    conf_df_filtered = conf_df_filtered.query("title_len>5")
    return conf_df_filtered.reset_index(drop=True)


def get_rots(dataset):
    rots = [item["rots"] for item in dataset]
    rots = set([x for item in rots for x in item])
    rots = list(rots)
    return rots
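
For orientation, get_conversations returns a mapping from post index to the ROT indices whose embedding similarity clears THRESHOLD, which get_top_n then reduces to the two best ROTs per post. An illustrative (entirely made-up) return value:

python
# purely illustrative: outer keys are post indices, inner keys ROT indices, values cosine similarities
{
    0: [{412: 0.81}, {97: 0.74}, {530: 0.73}],
    3: [{1201: 0.78}],
}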

Read data

python
conf_df_filtered = load_confessions("one-million-reddit-confessions.csv")
python
dataset = load_dataset("allenai/prosocial-dialog", split="train")

Embed and match ROTs with posts

python
rots = get_rots(dataset)
python
model = load_vectorizer()
python
rot_vector = vectorize_text(model, rots)
python
posts = conf_df_filtered["title"].values.tolist()
posts_vector = vectorize_text(model, posts)
python
match_dict = get_conversations(rot_vector, posts_vector)
python
json_data = convert_to_json(match_dict, rots, conf_df_filtered)
python
from datasets import Dataset
python
confession_dataset = Dataset.from_list(json_data)
python
confession_dataset
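
Each row of confession_dataset pairs a confession title with its matched ROTs and the post permalink. A sketch of the expected structure, with made-up field values for illustration:

python
# illustrative row; real values come from the Reddit dump and prosocial-dialog ROTs
{
    "context": "I lied to my best friend about why I missed her wedding",
    "rots": ["It's wrong to lie to your friends.", "You should be honest with people close to you."],
    "source": "https://www.reddit.com/r/confession/comments/...",
    "episode_done": True,
}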

Pseudo label

Each matched post is now pseudo-labeled with one of the five prosocial-dialog safety classes using the shahules786/prosocial-classifier model, keeping the classifier's confidence alongside the predicted label.

Dataset loader

python
label_to_id = {
    "__casual__": 0,
    "__needs_caution__": 1,
    "__needs_intervention__": 2,
    "__probably_needs_caution__": 3,
    "__possibly_needs_caution__": 4,
}
id_to_label = {v: k for k, v in label_to_id.items()}
python
from torch.utils.data import Dataset, DataLoader


class ProSocialDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.sep_token = self.tokenizer.sep_token
        self.dataset = dataset
        self.label2id = label_to_id
        self.id2label = {v: k for k, v in label_to_id.items()}
        self.max_len = MAXLEN

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        encoding = {}

        context = self.dataset[idx]["context"]
        rots = self.dataset[idx]["rots"]
        input_tokens = self.tokenizer.encode(context, add_special_tokens=False)
        # token budget left for the ROTs after the context and special tokens
        rot_budget = max(0, self.max_len - (len(input_tokens) + 4))

        rot_tokens = self.tokenizer.encode(
            self.tokenizer.sep_token.join(rots),
            add_special_tokens=False,
            max_length=rot_budget,
            truncation=True,
        )

        # <s> context </s> ROTs </s>, padded up to max_len; the attention mask is 0 over padding
        input_ids = [self.tokenizer.bos_token_id] + input_tokens + [self.tokenizer.sep_token_id] + rot_tokens + [self.tokenizer.eos_token_id]
        mask = [1] * len(input_ids) + [0] * max(0, self.max_len - len(input_ids))
        input_ids = input_ids + [self.tokenizer.pad_token_id] * max(0, self.max_len - len(input_ids))

        encoding["input_ids"] = torch.tensor(input_ids)
        encoding["attention_mask"] = torch.tensor(mask)

        return encoding

Load Model

python
MODEL = "shahules786/prosocial-classifier"
python
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

Inference

python
def predict(pred_dataloader):
    probs, labels = [], []
    model.eval()
    with torch.no_grad():
        for data in tqdm(pred_dataloader):
            logits = model(data["input_ids"], data["attention_mask"]).logits.numpy().astype("float16")
            pred = softmax(logits, axis=1)
            probs.append(pred.max(axis=1))
            labels.append([model.config.id2label.get(pred_id) for pred_id in pred.argmax(axis=1)])

    return np.concatenate(probs), np.concatenate(labels)
python
pred_dataset = ProSocialDataset(confession_dataset, tokenizer)
python
pred_dataloader = DataLoader(pred_dataset, batch_size=4)
python
prob, label = predict(pred_dataloader)

Add labels to dataset

python
confession_dataset = confession_dataset.add_column("confidence", np.array(prob, dtype="float32"))
confession_dataset = confession_dataset.add_column("safety_label", label)
confession_dataset = confession_dataset.add_column("response", [None] * len(confession_dataset))
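
Optionally, rows with low classifier confidence could be dropped before uploading; a minimal sketch (the 0.7 cutoff is an arbitrary assumption, not part of the original pipeline):

python
# assumption: discard rows whose pseudo-label confidence is below an arbitrary cutoff
confession_dataset_high_conf = confession_dataset.filter(lambda x: x["confidence"] > 0.7)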

Upload to Hub

python
confession_dataset.push_to_hub("shahules786/prosocial-confessions")
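
Once the push completes, the dataset can be loaded back from the Hub like any other dataset:

python
# push_to_hub creates a default "train" split
confessions = load_dataset("shahules786/prosocial-confessions", split="train")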