data/datasets/nsfw_selfharm_reddit/dataset-cookbook.ipynb
python
# autoreload
%load_ext autoreload
%autoreload 2
python
import praw
import prawcore
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm
import pickle
from random import sample, choice
import concurrent.futures

List of subreddits to scrape

This list was built from https://anvaka.github.io/redsim, which can also be used to expand the list of favourable subreddits.

Taking these for now:

python
subs = [
    "Sexpolls",
    "sexpositions",
    "Sexconfessional",
    "penissize",
    "masturbation",
    "AskRedditNSFW",
    "sexstories",
    "SexFantasies",
    "sexconfession",
    "askwoman_aboutsex",
    "sextips",
    "sexualhealth",
    "SexPositive",
    "DirtyConfession",
    "Puberty",
    "NSFWIAMA",
    "sexover30",
    "SexToys",
    "sexquestions",
    "deepvaginaproblems",
    "kegels",
    "sexeducation",
    "ColoredLang",
    "masterbationstories",
    "RedditAfterDark",
    "Threesome_advice",
]

# "NSFWIAMA" -> special one (TODO f)

Scrape these Subreddits

python
from utils import scrape_subreddit, init_praw_reddit, save_to_huggingface
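
The helpers scrape_subreddit, init_praw_reddit, and save_to_huggingface live in the local utils module and are not reproduced in this notebook. As a rough illustration, init_praw_reddit might look like the sketch below, assuming credentials are read from environment variables (the variable names are made up for this example):

python
# illustrative sketch of utils.init_praw_reddit, not the actual implementation
import os
import praw


def init_praw_reddit() -> praw.Reddit:
    # REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET are assumed names, not from the notebook
    return praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent="nsfw_selfharm_reddit dataset cookbook",
    )
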
python
import logging
import os

logger = logging.getLogger(__name__)  # logger for scrape failures

reddit = init_praw_reddit()

# make sure the intermediate folder exists, then loop through the subreddits
# and save each one to its own CSV
os.makedirs("dataframes", exist_ok=True)
for sub in tqdm(subs):
    try:
        df = scrape_subreddit(sub, reddit)
        if df is not None:
            file_name = f"dataframes/{sub}.csv"
            df.to_csv(file_name, index=False)
            print("subreddit saved to: ", file_name)
    except Exception as e:
        logger.error(f"Error scraping {sub}: {e}")
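
scrape_subreddit is also defined in utils. A plausible sketch follows, with the column names inferred from the cleaning cell below; the real helper may use a smarter question heuristic:

python
# illustrative sketch of utils.scrape_subreddit; columns inferred from later cells
import pandas as pd
import praw


def scrape_subreddit(sub: str, reddit: praw.Reddit, limit: int = 1000):
    rows = []
    for post in reddit.subreddit(sub).new(limit=limit):
        rows.append(
            {
                "post_id": post.id,
                "subreddit": sub,
                "title": post.title,
                "selftext": post.selftext,
                "is_self": post.is_self,  # True for text (self) posts
                # crude heuristic for question posts; assumed, not from the notebook
                "is_question": post.title.strip().endswith("?"),
            }
        )
    return pd.DataFrame(rows) if rows else None
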
python
import os

# clean the results: keep only self-posts that are questions
files = os.listdir("dataframes/")
dfs = []
for file in files:
    df = pd.read_csv(f"dataframes/{file}")
    dfs.append(df[df["is_question"] & df["is_self"]])

cleaned_df = pd.concat(dfs)
print(cleaned_df.shape, cleaned_df["subreddit"].value_counts())
python
save_to_huggingface(cleaned_df, name="jjmachan/NSFW-questions-inter-cleaned_df")

Filter with prosocial

The prosocial.ipynb notebook contains the code that filters the cleaned_df intermediate dataset stored at jjmachan/NSFW-questions-inter-cleaned_df.
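
That notebook is not reproduced here. A minimal sketch of what such a filter could look like, assuming ProsocialDialog-style safety labels (where __casual__ marks benign text) and a placeholder classifier checkpoint:

python
# sketch only: the checkpoint name is a placeholder, not the model actually used
import pandas as pd
from transformers import pipeline

clf = pipeline("text-classification", model="some-org/prosocial-safety-classifier")


def keep_casual(df: pd.DataFrame) -> pd.DataFrame:
    # assumes the frame has a `title` column and ProsocialDialog-style labels
    preds = clf(df["title"].tolist(), truncation=True)
    mask = [p["label"] == "__casual__" for p in preds]
    return df[mask]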

Get Comments

python
from utils import get_comments  # assumed to live in utils alongside the other helpers

# fetch comments for every post and write them to df_with_comments.csv
get_comments(cleaned_df["post_id"])
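
get_comments is not shown in the notebook either. A sketch under the assumption that it fetches the top five comments per post (the C1..C5 columns dropped in the merge below) and writes them to df_with_comments.csv:

python
# illustrative sketch of utils.get_comments; the C1..C5 layout is inferred below
import pandas as pd


def get_comments(post_ids, n_comments: int = 5) -> None:
    reddit = init_praw_reddit()  # defined in the same utils module
    rows = []
    for post_id in post_ids:
        submission = reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)  # drop "load more comments" stubs
        bodies = [c.body for c in list(submission.comments)[:n_comments]]
        bodies += [None] * (n_comments - len(bodies))  # pad short threads
        rows.append({"post_id": post_id, **{f"C{i+1}": b for i, b in enumerate(bodies)}})
    pd.DataFrame(rows).to_csv("df_with_comments.csv", index=False)
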
python
# de-duplicate on post_id, drop the old C1..C5 comment columns, and merge in the scraped ones
nsfw_with_comments = pd.read_csv("df_with_comments.csv").drop_duplicates(subset=["post_id"])
nsfw = pd.read_csv("nsfw.csv").drop_duplicates(subset=["post_id"])
nsfw_final = pd.merge(
    nsfw.drop(columns=[f"C{i+1}" for i in range(5)]),
    nsfw_with_comments,
    on="post_id",
)
print(nsfw.shape, nsfw_with_comments.shape, nsfw_final.shape)
python
nsfw_final.sample(15)

Save to Hugging Face

python
from utils import save_to_huggingface

save_to_huggingface(nsfw_final, name="jjmachan/NSFW-questions")
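
save_to_huggingface itself is likely a thin wrapper around the datasets library; a minimal sketch, assuming an authenticated Hugging Face session:

python
# illustrative sketch of utils.save_to_huggingface
import pandas as pd
from datasets import Dataset


def save_to_huggingface(df: pd.DataFrame, name: str) -> None:
    # requires `huggingface-cli login` (or HF_TOKEN) beforehand
    Dataset.from_pandas(df, preserve_index=False).push_to_hub(name)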