data/datasets/nsfw_selfharm_reddit/dataset-cookbook.ipynb
# autoreload
%load_ext autoreload
%autoreload 2
import praw
import prawcore
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm
import pickle
from random import sample, choice
import concurrent.futures
This list was built using https://anvaka.github.io/redsim, which can also be used to expand the list of favourable subreddits.
Taking these for now.
# Subreddit names to scrape; each entry is passed to the scraper as-is.
subs = [
    "Sexpolls",
    "sexpositions",
    "Sexconfessional",
    "penissize",
    "masturbation",
    "AskRedditNSFW",
    "sexstories",
    "SexFantasies",
    "sexconfession",
    "askwoman_aboutsex",
    "sextips",
    "sexualhealth",
    "SexPositive",
    "DirtyConfession",
    "Puberty",
    "NSFWIAMA",
    "sexover30",
    "SexToys",
    "sexquestions",
    "deepvaginaproblems",
    "kegels",
    "sexeducation",
    "ColoredLang",
    "masterbationstories",
    "RedditAfterDark",
    "Threesome_advice",
]
# "NSFWIAMA" -> special one (TODO f)
from utils import scrap_subreddit, init_praw_reddit, save_to_huggingface
import logging
import os

# Logger used to report per-subreddit failures without aborting the whole run.
logger = logging.getLogger(__name__)

reddit = init_praw_reddit()

# Intermediate per-subreddit CSVs are written to this folder; create it up
# front so `to_csv` does not fail when the folder is missing.
os.makedirs("dataframes", exist_ok=True)

# loop through subreddits and save to an intermediate folder
for sub in tqdm(subs):
    try:
        # Use the scraper under the name imported from `utils`.
        # NOTE(review): confirm the spelling `scrap_subreddit` against utils.py.
        df = scrap_subreddit(sub, reddit)
        # The scraper may return None; in that case nothing is written.
        if df is not None:
            file_name = f"dataframes/{sub}.csv"
            df.to_csv(file_name, index=False)
            print("subreddit saved to: ", file_name)
    except Exception as e:
        # Best effort: log the failure and continue with the next subreddit.
        logger.error(f"Error scraping {sub}: {e}")
import os
# clean the results
# Only read CSV files: the folder can also contain entries that
# `pd.read_csv` cannot parse (e.g. a Jupyter `.ipynb_checkpoints` directory).
files = [file for file in os.listdir("dataframes/") if file.endswith(".csv")]
dfs = []
for file in files:
    df = pd.read_csv(f"dataframes/{file}")
    # Keep only the rows where both `is_question` and `is_self` are true.
    dfs.append(df[df["is_question"] & df["is_self"]])
cleaned_df = pd.concat(dfs)
# Quick sanity check: overall shape and number of rows per subreddit.
print(cleaned_df.shape, cleaned_df["subreddit"].value_counts())
save_to_huggingface(cleaned_df, name="jjmachan/NSFW-questions-inter-cleaned_df")
prosocial.ipynb contains the code that converts the intermediate cleaned_df dataset stored in jjmachan/NSFW-questions-inter-cleaned_df.
# Collect comments for every cleaned post.
# NOTE(review): `get_comments` is not imported in the visible cells, and it
# presumably produces df_with_comments.csv — confirm both.
get_comments(cleaned_df["post_id"])

# Load both tables, keeping a single row per post_id in each.
nsfw_with_comments = pd.read_csv("df_with_comments.csv").drop_duplicates(
    subset=["post_id"]
)
nsfw = pd.read_csv("nsfw.csv").drop_duplicates(subset=["post_id"])

# Columns C1..C5 are removed from `nsfw` before the join on post_id.
columns_to_drop = [f"C{i+1}" for i in range(5)]
nsfw_without_c = nsfw.drop(columns=columns_to_drop)
nsfw_final = pd.merge(nsfw_without_c, nsfw_with_comments, on="post_id")

# Compare row/column counts before and after the merge, then eyeball a sample.
print(nsfw.shape, nsfw_with_comments.shape, nsfw_final.shape)
nsfw_final.sample(15)
from utils import save_to_huggingface

# Publish the final merged dataset.
save_to_huggingface(nsfw_final, name="jjmachan/NSFW-questions")