data/datasets/prosocial_confessions/prosocial-confessions.ipynb
! pip install -q kaggle
! pip install -q sentence_transformers
import faiss
import kaggle
import numpy as np
import pandas as pd
import torch
from collections import ChainMap, defaultdict
from datasets import load_dataset, Dataset
from scipy.special import softmax
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

np.set_printoptions(precision=3)

SBERT_MODEL = "all-MiniLM-L6-v2"  # sentence-transformers model used to embed rots and confessions
THRESHOLD = 0.72  # minimum cosine similarity for a confession/rot match
B_SIZE = 1000  # batch size for the FAISS search
MAXLEN = 196  # maximum input length (in tokens) for the safety classifier
# Download the data; get your kaggle.json from your account page (https://www.kaggle.com/me/account) and place it at ~/.kaggle/kaggle.json
kaggle.api.dataset_download_files("pavellexyr/one-million-reddit-confessions", unzip=True)
def load_vectorizer(model=SBERT_MODEL):
return SentenceTransformer(model)
def vectorize_text(model, texts):
return model.encode(texts, show_progress_bar=True)
def build_index(vectors):
    # Build a FAISS inner-product index; with L2-normalised vectors the
    # inner product equals cosine similarity.
    d = vectors.shape[1]
    index = faiss.IndexFlatIP(d)
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index
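# Illustrative sanity check (not part of the pipeline, uses random toy vectors):
# after normalisation, searching the index with its own vectors should return
# each vector as its own top hit with similarity close to 1.0.
_toy = np.random.rand(8, 4).astype("float32")
_toy_index = build_index(_toy)
_toy_sims, _toy_ids = _toy_index.search(_toy[:2], 3)
print(_toy_sims[:, 0], _toy_ids[:, 0])  # expect similarities near 1.0 and ids [0, 1]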
def get_conversations(rot_vector, posts_vector):
    # For every confession, retrieve the most similar rules-of-thumb (rots) and keep
    # those above THRESHOLD. Returns {post index: [{rot index: similarity}, ...]}.
    index = build_index(rot_vector)
    num_result = 5
    faiss.normalize_L2(posts_vector)
    matching_dict = defaultdict(list)
    for idx in tqdm(range(0, posts_vector.shape[0], B_SIZE)):
        sims, ids = index.search(posts_vector[idx : idx + B_SIZE], num_result)
        sim_indices = np.argwhere(sims >= THRESHOLD)
        for k, j in sim_indices:
            matching_dict[k + idx].append({ids[k][j]: sims[k][j]})
    return matching_dict
def get_top_n(data, n=2):
    # Merge the per-post {rot_id: similarity} dicts and return the ids of the n most similar rots.
    merged = dict(ChainMap(*data))
    merged = {k: v for k, v in sorted(merged.items(), key=lambda item: item[1], reverse=True)}
    return list(merged.keys())[:n]
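# Quick worked example with hypothetical values: three candidate rots with
# similarities 0.81, 0.90 and 0.75 should yield the ids of the two highest-scoring ones.
assert get_top_n([{12: 0.81}, {7: 0.90}, {3: 0.75}], n=2) == [7, 12]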
def convert_to_json(match_dict, all_rots, reddit_df):
    # Pair each matched confession with its top rots and keep the source permalink.
    all_data = []
    posts = reddit_df["title"].values
    permalinks = reddit_df["permalink"].values
    for post_id, rot_list in match_dict.items():
        rot_ids = get_top_n(rot_list)
        ps_rots = [all_rots[idx] for idx in rot_ids]
        all_data.append(
            {"context": posts[post_id], "rots": ps_rots, "source": permalinks[post_id], "episode_done": True}
        )
    return all_data
def load_confessions(path):
    # Keep confessions with score > 3 and titles longer than 5 words.
    conf_df = pd.read_csv(path)
    conf_df_filtered = conf_df.query("score>3.0").copy()
    conf_df_filtered["title_len"] = conf_df_filtered["title"].map(lambda x: len(x.split()))
    conf_df_filtered = conf_df_filtered.query("title_len>5")
    return conf_df_filtered.reset_index(drop=True)
def get_rots(dataset):
rots = [item["rots"] for item in dataset]
rots = set([x for item in rots for x in item])
rots = list(rots)
return rots
conf_df_filtered = load_confessions("one-million-reddit-confessions.csv")  # CSV unzipped by the Kaggle download above
dataset = load_dataset("allenai/prosocial-dialog", split="train")
rots = get_rots(dataset)
model = load_vectorizer()
rot_vector = vectorize_text(model, rots)
posts = conf_df_filtered["title"].values.tolist()
posts_vector = vectorize_text(model, posts)
match_dict = get_conversations(rot_vector, posts_vector)
json_data = convert_to_json(match_dict, rots, conf_df_filtered)
confession_dataset = Dataset.from_list(json_data)
confession_dataset
label_to_id = {
"__casual__": 0,
"__needs_caution__": 1,
"__needs_intervention__": 2,
"__probably_needs_caution__": 3,
"__possibly_needs_caution__": 4,
}
id_to_label = {v: k for k, v in label_to_id.items()}
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset


class ProSocialDataset(TorchDataset):
    def __init__(self, dataset, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.sep_token = self.tokenizer.sep_token
        self.dataset = dataset
        self.label2id = label_to_id
        self.id2label = {v: k for k, v in label_to_id.items()}
        self.max_len = MAXLEN

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        encoding = {}
        context = self.dataset[idx]["context"]
        rots = self.dataset[idx]["rots"]
        input_tokens = self.tokenizer.encode(context, add_special_tokens=False)
        # Reserve room for the special tokens and truncate the rots to whatever budget is left.
        rots_budget = max(0, self.max_len - (len(input_tokens) + 4))
        rot_tokens = []
        if rots_budget > 0:
            rot_tokens = self.tokenizer.encode(
                self.tokenizer.sep_token.join(rots),
                add_special_tokens=False,
                max_length=rots_budget,
                truncation=True,
            )
        input_ids = (
            [self.tokenizer.bos_token_id]
            + input_tokens
            + [self.tokenizer.sep_token_id]
            + rot_tokens
            + [self.tokenizer.eos_token_id]
        )
        input_ids = input_ids[: self.max_len]
        # Build the attention mask before padding so the padded positions are masked out.
        mask = [1] * len(input_ids) + [0] * (self.max_len - len(input_ids))
        input_ids = input_ids + [self.tokenizer.pad_token_id] * (self.max_len - len(input_ids))
        encoding["input_ids"] = torch.tensor(input_ids)
        encoding["attention_mask"] = torch.tensor(mask)
        return encoding
MODEL = "shahules786/prosocial-classifier"  # safety-label classifier applied to the matched confessions below
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # the classifier is RoBERTa-based, so its base tokenizer is used
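# Optional sanity check (assumes the cells above have run): every encoded item
# should be exactly MAXLEN tokens long so the DataLoader below can stack batches.
_sample = ProSocialDataset(confession_dataset, tokenizer)[0]
print(_sample["input_ids"].shape, _sample["attention_mask"].shape)  # both torch.Size([196]) with MAXLEN=196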
def predict(pred_dataloader):
    # Run the classifier in inference mode and collect the top label and its probability per example.
    probs, labels = [], []
    model.eval()
    with torch.no_grad():
        for data in tqdm(pred_dataloader):
            logits = model(data["input_ids"], data["attention_mask"]).logits.numpy().astype("float16")
            pred = softmax(logits, axis=1)
            probs.append(pred.max(axis=1))
            labels.append([model.config.id2label.get(pred_id) for pred_id in pred.argmax(axis=1)])
    return np.concatenate(probs), np.concatenate(labels)
pred_dataset = ProSocialDataset(confession_dataset, tokenizer)
pred_dataloader = DataLoader(pred_dataset, batch_size=4)
prob, label = predict(pred_dataloader)
confession_dataset = confession_dataset.add_column("confidence", np.array(prob, dtype="float32"))
confession_dataset = confession_dataset.add_column("safety_label", label)
confession_dataset = confession_dataset.add_column("response", [None] * len(confession_dataset))
confession_dataset.push_to_hub("shahules786/prosocial-confessions")
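# Verification step (assumes you are logged in to the Hugging Face Hub,
# e.g. via `huggingface-cli login`): reload the dataset that was just pushed.
load_dataset("shahules786/prosocial-confessions", split="train")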