notebooks/data-augmentation/changemyview-builder/data_processor.ipynb
Converts r/ChangeMyView subreddit data into an instruction/response format for ML training
### REMEMBER: set up the .env before running this code!
"""CONSTANTS"""
# Set ENTRIES_COUNT to the number of posts you want to load from the dataset
ENTRIES_COUNT = 10
# Set the threshold for toxic comments to be removed
TOXIC_THRESHOLD = 0.95
# Install any dependencies
!pip install pandas
!pip install praw
!pip install python-dotenv
!pip install pyarrow
!pip install detoxify
!pip install tqdm
import pandas as pd
import praw
import os
from os.path import join, dirname
from dotenv import load_dotenv
# Make sure you create a .env file and fill in all the necessary information in the same folder as this notebook!
load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))
reddit = praw.Reddit(
client_id=os.environ.get("CLIENT_ID"),
client_secret=os.environ.get("CLIENT_SECRET"),
user_agent="CMV_Scraper",
)
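# Optional sanity check (a small sketch, assuming the .env defines CLIENT_ID and CLIENT_SECRET
# as used by the praw.Reddit() call above): fail early with a clear message if they are missing.
missing = [key for key in ("CLIENT_ID", "CLIENT_SECRET") if not os.environ.get(key)]
assert not missing, f"Missing .env entries: {missing}"
print("Read-only Reddit client ready:", reddit.read_only)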
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO
import numpy as np
fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname
# download if not exists
if not os.path.isfile(fname):
    f = BytesIO()
    with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:
        data = resp.read()
        f_disk.write(data)  # save to disk too
        f.write(data)
    f.seek(0)
else:
    f = open(fname, 'rb')
#tar = tarfile.open(fileobj=f, mode="r:bz2")
tar = tarfile.open(fileobj=f, mode="r")
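# Optional peek: list the archive members so the op_task/*.jsonlist.bz2 paths used below are easy to spot
tar.getnames()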
# Extract the file we are interested in
train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"
train_bzlist = tar.extractfile(train_fname)
# Deserialize the JSON list
original_posts_train = [
    json.loads(line.decode('utf-8'))
    for line in BZ2File(train_bzlist)
]
original_posts_train[:1]
# Load the jsonlist file into a dataframe
#df = pd.read_json(original_posts_train, orient='list', lines=True)
df = pd.DataFrame(original_posts_train)
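# Optional peek at the loaded posts; "name", "title", and "selftext" are the columns used later in this notebook
print(df.shape)
df[["name", "title", "selftext"]].head(3)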
# Function to check whether a post still exists on Reddit
def try_get_post(post_id):
    try:
        submission = reddit.submission(id=post_id)
        submission.name  # accessing an attribute forces PRAW to fetch the post; it raises if the post is gone
        return True
    except Exception:
        return False
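# Optional usage example (makes one live API call): check whether the first post in the training
# data still exists. The "name" column holds the fullname, so the "t3_" prefix is stripped first.
try_get_post(df.iloc[0]["name"].removeprefix("t3_"))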
# Set up the detoxify model once, so it isn't re-loaded for every comment:
from detoxify import Detoxify
import re
detoxify_model = Detoxify("multilingual")
# Removes quoted lines ("<" prefix is not used on Reddit, ">" is), edit notes,
# and the CMV footer template from a post body
def cleanup_body_text(cmv_post):
    lines = [line for line in cmv_post.splitlines()
             if not line.lstrip().startswith(">")
             and not line.lstrip().startswith("____")
             and not line.lstrip().startswith("So go forth and CMV, noble redditors!")
             and "edit" not in " ".join(line.lower().split()[:2])
             ]
    return "\n".join(lines)
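# Small illustration (no Reddit access needed) of what cleanup_body_text strips out:
example_body = "> quoted text\nI think this is wrong because X.\nEdit: typo\n_____\nSo go forth and CMV, noble redditors!"
cleanup_body_text(example_body)  # -> "I think this is wrong because X."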
# Create the function that will be handling all the data gathering
def get_top_comment_and_clean_data(post_id):
    last_author = ""
    # Grab the post ("t3_" is just Reddit's fullname prefix, so strip it from the id)
    submission = reddit.submission(id=post_id.removeprefix("t3_"))
    # Sort comments by "best" and flatten the reply tree under the highest rated root comment
    submission.comment_sort = 'best'
    submission.comments.replace_more(limit=0)
    top_level_comments = list(submission.comments)
    if not top_level_comments:
        return []
    replies = top_level_comments[0].replies.list()
    pros = []
    # If the post author doesn't exist, this submission was deleted (submission.deleted doesn't work)
    if submission.author is None:
        last_author = "[deleted]"
    else:
        last_author = submission.author.name
    is_pro_argument = False
    for comment in replies:
        # If the Redditor object doesn't exist, the account is invalid/deleted
        if comment.author is not None:
            author = comment.author.name
        else:
            author = "[deleted]"
        # Assume that whenever the author changes, they are countering the previous person
        if author != last_author:
            is_pro_argument = not is_pro_argument
        if author == "[deleted]" or author == "DeltaBot":
            continue
        # Remove meta lines that mention the subreddit itself
        comment.body = " ".join([line for line in comment.body.splitlines()
                                 if not re.search(r"(?i)(Change\smy\sview|CMV)", line)
                                 ])
        # Sometimes duplicate entries exist; skip them
        if comment.body in pros:
            continue
        # Blank out toxic comments
        if detoxify_model.predict(comment.body)["toxicity"] > TOXIC_THRESHOLD:
            comment.body = ""
        # Pros = arguments supporting the title of this post
        if is_pro_argument:
            pros.append(comment.body)
        last_author = author
    return pros
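# Optional smoke test (makes live Reddit API calls): run the pipeline on a single post
# from the training data before looping over everything below.
get_top_comment_and_clean_data(df.iloc[0]["name"])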
print(f"Loading in {ENTRIES_COUNT} posts")
# "name" clashes with the DataFrame's built-in .name attribute, so copy it into a dedicated column
# (working on an explicit copy also avoids pandas' SettingWithCopyWarning)
dataset = df.head(ENTRIES_COUNT).copy()
dataset["post_id"] = dataset["name"]
%%time
from tqdm.auto import tqdm
# Reset variables for if we run this multiple times
all_pros = []
all_names = []
all_titles = []
all_sources = []
print("Loading in data... This will take a while.")
for i in tqdm(range(dataset.shape[0])):
    post = dataset.iloc[i]
    # Skip deleted posts before doing any API work
    if post is None or post.title == "[deleted]":
        continue
    modified_title = post.title.replace('CMV', "Change my mind")
    pros = get_top_comment_and_clean_data(post.post_id)
    # De-duplicate while keeping order, then drop deletion markers
    pros = " ".join(dict.fromkeys(pros))
    pros = pros.replace("[deleted]", "")
    selftext = cleanup_body_text(post.selftext)
    all_titles.append(modified_title + " " + selftext)
    all_pros.append(pros)
    all_names.append(post.post_id)
    all_sources.append(f"https://reddit.com/r/changemyview/comments/{post.post_id.removeprefix('t3_')}")
all_pros[1]
# Place it all into a pandas DataFrame
clean_df = pd.DataFrame({
    "INSTRUCTION": all_titles,
    "RESPONSE": all_pros,
    "SOURCE": all_sources
}, index=all_names)
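# Quick look at the assembled training rows before writing them out
clean_df.head()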
# Create the Apache Parquet file
import pyarrow as pa
import pyarrow.parquet as pq
table = pa.Table.from_pandas(clean_df)
pq.write_table(table,"output.parquet")
# Test to see if it was successful
table = pq.read_table("output.parquet")
table.to_pandas()
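# Optional round-trip check: the reloaded table should match the DataFrame we wrote,
# since from_pandas/to_pandas preserve the index by default.
table.to_pandas().equals(clean_df)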