notebooks/data-augmentation/changemyview-builder/data_processor.ipynb


r/ChangeMyView data converter

Converts r/ChangeMyView submissions and their top comment threads into an INSTRUCTION/RESPONSE/SOURCE dataset (written to output.parquet) for ML training

python
### REMEMBER: set up the .env file before running this code!

"""CONSTANTS"""

# Number of posts to load from the dataset (passed to df.head below)
ENTRIES_COUNT = 10

# Comments with a Detoxify toxicity score above this threshold are filtered out
TOXIC_THRESHOLD = 0.95
python
# Install any dependencies
!pip install pandas
!pip install praw
!pip install python-dotenv
!pip install pyarrow
!pip install detoxify
!pip install tqdm
python
import pandas as pd
import praw
import os
from os.path import join, dirname
from dotenv import load_dotenv

# Make sure you create a .env file, in the same folder as this script, with all the necessary information!
load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))

reddit = praw.Reddit(
   client_id=os.environ.get("CLIENT_ID"),
   client_secret=os.environ.get("CLIENT_SECRET"),
   user_agent="CMV_Scraper",
)
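
# A minimal .env next to this notebook only needs the two keys read above
# (placeholder values shown, not real credentials):
#
#   CLIENT_ID=your_reddit_app_client_id
#   CLIENT_SECRET=your_reddit_app_client_secret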

python
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# download if not exists
if not os.path.isfile(fname):
    f = BytesIO()
    with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:
        data = resp.read()
        f_disk.write(data)  # save to disk too
        f.write(data)
        f.seek(0)
else:
    f = open(fname, 'rb')



python
#tar = tarfile.open(fileobj=f, mode="r:bz2")
tar = tarfile.open(fileobj=f, mode="r")

# Extract the file we are interested in

train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"

train_bzlist = tar.extractfile(train_fname)
python
# Deserialize the JSON list
original_posts_train = [
    json.loads(line.decode('utf-8'))
    for line in BZ2File(train_bzlist)
]
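
# Each deserialized record is a dict of submission fields; the ones used further down are
# "title", "selftext", and "name" (the "t3_"-prefixed fullname).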
python
original_posts_train[:1]
python
# Load the jsonlist file into a dataframe
#df = pd.read_json(original_posts_train, orient='list', lines=True)
df = pd.DataFrame(original_posts_train)
python
# Function to check if a post still exists on Reddit
def try_get_post(post_id):
    try:
        submission = reddit.submission(id=post_id)
        # Accessing an attribute forces the lazy PRAW object to fetch the post,
        # which raises if the request fails
        submission.name
        return True
    except Exception as e:
        return False
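
# Example usage (hypothetical post id):
#   try_get_post("abc123")   # True if the post can still be fetched, False otherwise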
python
# Set up the detoxifier model (loaded once here so it isn't re-initialised for every comment):
from detoxify import Detoxify

detoxify_model = Detoxify("multilingual")
python
import re

# Removes quoted lines (starting with ">"), the subreddit footer template, and edit notes from a post body
def cleanup_body_text(cmv_post):
    lines = [line for line in cmv_post.splitlines()
            if not line.lstrip().startswith(">")
            and not line.lstrip().startswith("____")
            and not line.lstrip().startswith("So go forth and CMV, noble redditors!")
            and "edit" not in " ".join(line.lower().split()[:2])
            ]
    return "\n".join(lines)
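
# Worked example (hypothetical post body): quoted lines, the divider, and the footer are dropped:
#   cleanup_body_text("I think X.\n> you are wrong\n____\nSo go forth and CMV, noble redditors!")
#   returns "I think X."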




# Fetch a post's top comment thread, clean it, and return the collected reply texts
def get_top_comment_and_clean_data(post_id):
    #print(post_id.lstrip("t3_"))
    last_author = ""
    # Grab the post (strip the "t3_" fullname prefix to get the bare submission id)
    submission = reddit.submission(id=post_id.removeprefix("t3_"))
    #print(submission.title)

    # Sort comments by "best" and take the reply thread under the top root comment
    submission.comment_sort = 'best'
    submission.comments.replace_more(limit=0)
    top_level_comments = list(submission.comments)
    if not top_level_comments:
        return []
    replies = top_level_comments[0].replies.list()

    # Collected argument texts
    pros = []

    # If the post author doesn't exist, this submission was deleted (submission.deleted doesn't work)
    if submission.author is None:
        last_author = "[deleted]"
    else:
        last_author = submission.author.name

    is_pro_argument = False

    for comment in replies:

        # If the redditor object doesn't exist, the account is invalid/deleted
        if comment.author is not None:
            author = comment.author.name
        else:
            author = "[deleted]"

        # Assume that whenever the user changes, they are countering the previous person
        if author != last_author:
            is_pro_argument = not is_pro_argument

        if author == "[deleted]" or author=="DeltaBot":
            #print("Skipping comment...")
            continue

        # Remove meta lines that mention the subreddit / "CMV"
        # (pros holds full comment bodies, so "line not in pros" only catches a line that
        #  repeats an entire earlier comment; full duplicates are skipped below)
        comment.body = " ".join([line for line in comment.body.splitlines()
                                  if not re.search(r"(?i)(Change\smy\sview|CMV)", line)
                                  and line not in pros
                                  ])

        # Sometimes duplicate entries exist; skip them
        # (automated "Δ" messages come from DeltaBot, which is skipped above)
        if comment.body in pros:
            #print("Skipping duplicate entry")
            continue

        #print("\t\t>>\t",comment.body)

        # Blank out toxic comments (uses the Detoxify model loaded once above)
        if detoxify_model.predict(comment.body)["toxicity"] > TOXIC_THRESHOLD:
            #print("Identified toxic comment, ignoring...")
            comment.body = ""

        # Pros = arguments for the title of this post
        # Cons = arguments against the title of this post (currently not collected)
        if is_pro_argument:
            pros.append(comment.body)

        last_author = author

    return pros
python
print(f"Loading in {ENTRIES_COUNT} posts")
dataset = df.head(ENTRIES_COUNT)

python
# The "name" column clashes with the DataFrame's built-in .name attribute, so copy it to a dedicated post_id column

import warnings
warnings.filterwarnings('ignore')

dataset["post_id"] = dataset["name"]
warnings.filterwarnings('default')
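
The clash is easy to see on a single row (hypothetical values): .name resolves to the row's index label, while ["name"] returns the column value.

python
# Hypothetical illustration of the name / .name clash on a single row:
row = pd.Series({"name": "t3_abc123", "title": "CMV: example"}, name=0)
row.name        # -> 0 (the index label)
row["name"]     # -> "t3_abc123" (the column value)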
python
%%time

from tqdm.auto import tqdm
# Reset the accumulators in case this cell is run multiple times
all_pros = []
all_names = []
all_titles = []
all_sources = []

print("Loading in data... This will take a while.")

for i in tqdm(range(dataset.shape[0])):

    post = dataset.iloc[i]
    modified_title = post.title.replace('CMV', "Change my mind")
    #print(f"\n Loading entry {i+1}/{dataset.shape[0]}:\n\t\"{modified_title}\"")

    if post is None:
        continue

    assert(post.post_id != i)

    pros = get_top_comment_and_clean_data(post.post_id)

    if post.title == "[deleted]":
        continue

    pros = " ".join([*set(pros)])
    pros = pros.replace("[deleted]","")

    post.selftext = cleanup_body_text(post.selftext)
    all_titles.append(modified_title + " " + post.selftext)
    all_pros.append(pros)
    all_names.append(post.post_id)
    all_sources.append(f"https://reddit.com/r/changemyview/comments/{post.post_id.removeprefix('t3_')}")
    #print(post.title)



python
all_pros[1]
python
# Place it all into a Pandas Dataframe
clean_df = pd.DataFrame({
    "INSTRUCTION": all_titles,
    "RESPONSE": all_pros,
    "SOURCE": all_sources
}, index=all_names
)
python
# Create Apache Parquet file

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(clean_df)
pq.write_table(table,"output.parquet")
python
# Test to see if it was successful
table = pq.read_table("output.parquet")
table.to_pandas()