data/datasets/tlcv2.0_oa/tlcv2_0_oa.ipynb
# Fetch the tlc v.2.0 release archive from GitHub into the working directory,
# then unpack it there. The leading "!" is the IPython/Jupyter shell escape,
# so these two lines only run inside a notebook, not under plain Python.
!wget https://github.com/jitkapat/thailitcorpus/releases/download/v.2.0/tlc_v.2.0.tar.gz
!tar -xf tlc_v.2.0.tar.gz
import ast
import glob
import json


def parse_corpus_text(raw: str):
    """Parse the contents of one corpus file without executing it.

    Strict JSON is tried first. If that fails, the text is parsed as a
    Python literal (single-quoted strings, ``True``/``None`` and so on).
    ``eval()`` is deliberately avoided: it would run any code embedded in
    a downloaded file.

    Args:
        raw: Full text of a corpus file, with any byte-order mark removed.

    Returns:
        The decoded object (the loading loop below expects an iterable of
        records).

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            valid Python literal.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Not strict JSON: accept literal syntax only, never arbitrary code.
        return ast.literal_eval(raw)


# Records collected from every "*.json" file in the working directory.
data = []
# glob returns paths in arbitrary order; sorting keeps row order reproducible.
for json_path in sorted(glob.glob("*.json")):
    # "utf-8-sig" transparently drops a leading byte-order mark, if present.
    with open(json_path, "r", encoding="utf-8-sig") as f:
        data.extend(parse_corpus_text(f.read()))
from datasets import load_dataset
import pandas as pd
# Build one row per record in `data`, with three columns:
#   TEXT     - every entry of record["text"] is concatenated with "" and the
#              results are joined with newlines (presumably each entry is a
#              list of string fragments forming one line - confirm against
#              the corpus schema)
#   SOURCE   - the same constant label on every row
#   METADATA - the record's chapter number and title
df = pd.DataFrame.from_dict(
    {
        "TEXT": [
            "\n".join("".join(fragments) for fragments in record["text"])
            for record in data
        ],
        "SOURCE": ["tlc v.2.0"] * len(data),
        "METADATA": [
            {"ch_num": record["ch_num"], "title": record["title"]}
            for record in data
        ],
    }
)
# Bare expression: shows the frame when run as a notebook cell.
df
# Persist the frame to a local parquet file via pyarrow, 100 rows per row group.
df.to_parquet("dataset.parquet", engine="pyarrow", row_group_size=100)
from datasets import Dataset
# Load the parquet file written earlier in this notebook as a `datasets.Dataset`.
ds = Dataset.from_parquet("dataset.parquet")
# Bare expression: shows the dataset summary when run as a notebook cell.
ds
# Upload the dataset to the Hub repository "pythainlp/tlcv2.0_oa".
# NOTE(review): presumably requires being logged in to the Hugging Face Hub
# with write access to that repository - confirm before running.
ds.push_to_hub("pythainlp/tlcv2.0_oa")