Back to Tensorzero

Remove duplicates

examples/optimization/dicl/data/clean_data.ipynb

2026.4.11.1 KB
Original Source
python
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
python
# Paths of the raw tab-separated input and of the cleaned CSV written at the
# end of the notebook; both are resolved against the working directory.
INPUT_FILE = Path("raw_data.tsv")
OUTPUT_FILE = Path("clean_data.csv")

# Fail fast with an explicit error rather than `assert`: assertions are
# stripped when Python runs with -O and carry no hint about what is missing.
if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Input file not found: {INPUT_FILE.resolve()}")
python
# Load the headerless, tab-separated raw file and name its two columns.
# NOTE(review): read_csv uses its default quote handling here; if the raw
# texts can contain unbalanced double quotes, confirm the loaded row count
# matches the number of lines in the raw file.
df_input = pd.read_csv(INPUT_FILE, sep="\t", header=None)
df_input.columns = ["class", "text"]

# Encode the string labels as integers. Series.map turns every value missing
# from the mapping into NaN, so reject unexpected labels up front instead of
# letting NaN leak into the stratified split and the output file.
label_to_id = {"ham": 0, "spam": 1}
unknown_labels = set(df_input["class"].unique()) - set(label_to_id)
if unknown_labels:
    raise ValueError(
        f"Unexpected class labels in {INPUT_FILE}: {sorted(unknown_labels, key=str)}"
    )
df_input["class"] = df_input["class"].map(label_to_id)
python
# Keep only the first row for each distinct message text.

print("Rows before dropping duplicates:", len(df_input))
is_repeated_text = df_input.duplicated(subset=["text"])
df_input = df_input[~is_repeated_text]
print("Rows after dropping duplicates:", len(df_input))
python
# Stratified 80/20 train/test split, recorded as an `is_train` flag column.

train_idx, test_idx = train_test_split(
    df_input.index,
    test_size=0.2,
    random_state=42,
    stratify=df_input["class"],
)

# Every row starts flagged as test (0); rows drawn for training flip to 1.
df_input["is_train"] = 0
df_input.loc[train_idx, "is_train"] = 1

# Put the flag column first and sort so test rows (0) precede train rows (1).
column_order = ["is_train", "class", "text"]
df_input = df_input[column_order].sort_values(by="is_train")

# The flag only ever holds 0 or 1, so the negated mask selects the test rows.
is_train_row = df_input["is_train"] == 1
print("Train set size:", is_train_row.sum())
print("Test set size:", (~is_train_row).sum())
python
# Write the cleaned dataset to CSV, omitting the pandas index.
output_columns = ["is_train", "class", "text"]
df_input.to_csv(OUTPUT_FILE, columns=output_columns, index=False)