Back to Open Assistant

TLC v2.0 OA

data/datasets/tlcv2.0_oa/tlcv2_0_oa.ipynb

0.0.1910 B
Original Source
python
# Download the v.2.0 release archive (tlc_v.2.0.tar.gz) from the
# jitkapat/thailitcorpus GitHub releases into the current directory.
!wget https://github.com/jitkapat/thailitcorpus/releases/download/v.2.0/tlc_v.2.0.tar.gz
python
# Unpack the downloaded archive into the current directory; the loading
# cell that follows globs for "*.json" here.
!tar -xf tlc_v.2.0.tar.gz
python
import ast
import glob
import json


def parse_records(raw: str):
    """Decode the text of one corpus file into a Python object.

    Strict JSON is tried first. If that fails, the text is parsed as a
    Python literal, which keeps files loadable that the previous
    ``eval``-based loader accepted. Unlike ``eval``, neither path can execute
    code embedded in the downloaded files, and JSON ``true``/``false``/``null``
    are handled correctly.

    Args:
        raw: Full text content of one corpus file.

    Returns:
        The decoded object. The loading loop below expects a list of records.

    Raises:
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            valid Python literal.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Not strict JSON (e.g. single-quoted strings): parse it as a plain
        # Python literal instead of evaluating it as code.
        return ast.literal_eval(raw)


# Accumulates the records of every *.json file in the current directory.
data = []
# glob returns matches in filesystem-dependent order; sorting keeps the row
# order of the resulting dataset reproducible across runs and machines.
for path in sorted(glob.glob("*.json")):
    # "utf-8-sig" strips a leading byte-order mark if the file has one.
    with open(path, "r", encoding="utf-8-sig") as f:
        data.extend(parse_records(f.read()))
python
from datasets import load_dataset
import pandas as pd
python
# Build one row per corpus record:
#   TEXT     - each element of record["text"] is concatenated with no
#              separator, and the resulting pieces are joined with newlines
#              (presumably token lists forming lines - confirm against data).
#   SOURCE   - the same constant label for every row.
#   METADATA - the record's "ch_num" and "title" fields.
df = pd.DataFrame(
    {
        "TEXT": [
            "\n".join("".join(piece) for piece in record["text"])
            for record in data
        ],
        "SOURCE": ["tlc v.2.0"] * len(data),
        "METADATA": [
            {"ch_num": record["ch_num"], "title": record["title"]}
            for record in data
        ],
    }
)
python
# Bare expression: the notebook renders the DataFrame for a visual check.
df
python
# Write the DataFrame to "dataset.parquet" using the pyarrow engine;
# row_group_size=100 is forwarded to the Parquet writer.
df.to_parquet(
    "dataset.parquet",
    engine="pyarrow",
    row_group_size=100,
)
python
from datasets import Dataset

# Reload the Parquet file written earlier in this notebook as a
# `datasets.Dataset`, the object that is later pushed to the Hub.
ds = Dataset.from_parquet("dataset.parquet")
python
# Bare expression: the notebook prints the Dataset summary for a visual check.
ds
python
# Upload the dataset to the "pythainlp/tlcv2.0_oa" repository on the
# Hugging Face Hub.
# NOTE(review): presumably requires being logged in with write access to
# that repository - confirm before re-running.
ds.push_to_hub("pythainlp/tlcv2.0_oa")