data/datasets/nsfw_selfharm_reddit/dataset-cookbook.ipynb
python
# autoreload
%load_ext autoreload
%autoreload 2
python
import praw
import prawcore
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm
import pickle
from random import sample, choice
import concurrent.futures

List of subreddits to scrape

This list was built from https://anvaka.github.io/redsim, which can also be used to expand the list of favourable subreddits.

Taking these for now:

python
subs = [
    "Sexpolls",
    "sexpositions",
    "Sexconfessional",
    "penissize",
    "masturbation",
    "AskRedditNSFW",
    "sexstories",
    "SexFantasies",
    "sexconfession",
    "askwoman_aboutsex",
    "sextips",
    "sexualhealth",
    "SexPositive",
    "DirtyConfession",
    "Puberty",
    "NSFWIAMA",
    "sexover30",
    "SexToys",
    "sexquestions",
    "deepvaginaproblems",
    "kegels",
    "sexeducation",
    "ColoredLang",
    "masterbationstories",
    "RedditAfterDark",
    "Threesome_advice",
]

# "NSFWIAMA" -> special one (TODO f)

Scrape these Subreddits

python
from utils import scrape_subreddit, init_praw_reddit, save_to_huggingface
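
The helpers scrape_subreddit, init_praw_reddit, and save_to_huggingface live in the local utils module and are not reproduced in this notebook. As a rough illustration, init_praw_reddit might look like the sketch below, assuming credentials are read from environment variables (the variable names are made up for this example):

python
# illustrative sketch of utils.init_praw_reddit, not the actual implementation
import os
import praw


def init_praw_reddit() -> praw.Reddit:
    # REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET are assumed names, not from the notebook
    return praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent="nsfw_selfharm_reddit dataset cookbook",
    )
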
python
import logging
import os

logger = logging.getLogger(__name__)  # logger for scrape failures

reddit = init_praw_reddit()

# make sure the intermediate folder exists, then loop through the subreddits
# and save each one to its own CSV
os.makedirs("dataframes", exist_ok=True)
for sub in tqdm(subs):
    try:
        df = scrape_subreddit(sub, reddit)
        if df is not None:
            file_name = f"dataframes/{sub}.csv"
            df.to_csv(file_name, index=False)
            print("subreddit saved to: ", file_name)
    except Exception as e:
        logger.error(f"Error scraping {sub}: {e}")
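
scrape_subreddit is also defined in utils. A plausible sketch follows, with the column names inferred from the cleaning cell below; the real helper may use a smarter question heuristic:

python
# illustrative sketch of utils.scrape_subreddit; columns inferred from later cells
import pandas as pd
import praw


def scrape_subreddit(sub: str, reddit: praw.Reddit, limit: int = 1000):
    rows = []
    for post in reddit.subreddit(sub).new(limit=limit):
        rows.append(
            {
                "post_id": post.id,
                "subreddit": sub,
                "title": post.title,
                "selftext": post.selftext,
                "is_self": post.is_self,  # True for text (self) posts
                # crude heuristic for question posts; assumed, not from the notebook
                "is_question": post.title.strip().endswith("?"),
            }
        )
    return pd.DataFrame(rows) if rows else None
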
python
import os

# clean the results: keep only self-posts that are questions
files = os.listdir("dataframes/")
dfs = []
for file in files:
    df = pd.read_csv(f"dataframes/{file}")
    dfs.append(df[df["is_question"] & df["is_self"]])

cleaned_df = pd.concat(dfs)
print(cleaned_df.shape, cleaned_df["subreddit"].value_counts())
python
save_to_huggingface(cleaned_df, name="jjmachan/NSFW-questions-inter-cleaned_df")

Filter with prosocial

The prosocial.ipynb notebook contains the code that filters the cleaned_df intermediate dataset stored at jjmachan/NSFW-questions-inter-cleaned_df.
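
That notebook is not reproduced here. A minimal sketch of what such a filter could look like, assuming ProsocialDialog-style safety labels (where __casual__ marks benign text) and a placeholder classifier checkpoint:

python
# sketch only: the checkpoint name is a placeholder, not the model actually used
import pandas as pd
from transformers import pipeline

clf = pipeline("text-classification", model="some-org/prosocial-safety-classifier")


def keep_casual(df: pd.DataFrame) -> pd.DataFrame:
    # assumes the frame has a `title` column and ProsocialDialog-style labels
    preds = clf(df["title"].tolist(), truncation=True)
    mask = [p["label"] == "__casual__" for p in preds]
    return df[mask]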

Get Comments

python
from utils import get_comments  # assumed to live in utils alongside the other helpers

# fetch comments for every post and write them to df_with_comments.csv
get_comments(cleaned_df["post_id"])
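
get_comments is not shown in the notebook either. A sketch under the assumption that it fetches the top five comments per post (the C1..C5 columns dropped in the merge below) and writes them to df_with_comments.csv:

python
# illustrative sketch of utils.get_comments; the C1..C5 layout is inferred below
import pandas as pd


def get_comments(post_ids, n_comments: int = 5) -> None:
    reddit = init_praw_reddit()  # defined in the same utils module
    rows = []
    for post_id in post_ids:
        submission = reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)  # drop "load more comments" stubs
        bodies = [c.body for c in list(submission.comments)[:n_comments]]
        bodies += [None] * (n_comments - len(bodies))  # pad short threads
        rows.append({"post_id": post_id, **{f"C{i+1}": b for i, b in enumerate(bodies)}})
    pd.DataFrame(rows).to_csv("df_with_comments.csv", index=False)
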
python
# de-duplicate on post_id, drop the old C1..C5 comment columns, and merge in the scraped ones
nsfw_with_comments = pd.read_csv("df_with_comments.csv").drop_duplicates(subset=["post_id"])
nsfw = pd.read_csv("nsfw.csv").drop_duplicates(subset=["post_id"])
nsfw_final = pd.merge(
    nsfw.drop(columns=[f"C{i+1}" for i in range(5)]),
    nsfw_with_comments,
    on="post_id",
)
print(nsfw.shape, nsfw_with_comments.shape, nsfw_final.shape)
python
nsfw_final.sample(15)

Save to Hugging Face

python
from utils import save_to_huggingface

save_to_huggingface(nsfw_final, name="jjmachan/NSFW-questions")
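
save_to_huggingface itself is likely a thin wrapper around the datasets library; a minimal sketch, assuming an authenticated Hugging Face session:

python
# illustrative sketch of utils.save_to_huggingface
import pandas as pd
from datasets import Dataset


def save_to_huggingface(df: pd.DataFrame, name: str) -> None:
    # requires `huggingface-cli login` (or HF_TOKEN) beforehand
    Dataset.from_pandas(df, preserve_index=False).push_to_hub(name)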