Demonstrating Hugging Face Modin Interoperability

All the examples in this section are taken/ adapted from https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook

python

import modin.pandas as pd
import numpy as np # linear algebra

python

import tensorflow as tf
import sklearn
from tqdm import tqdm

python

import urllib.request
url_path = "https://modin-datasets.intel.com/testing/IMDB_Dataset.csv"
urllib.request.urlretrieve(url_path, "imdb.csv")

python

%%time
modin_df = pd.read_csv("imdb.csv")

python

modin_df.head()

python

type(modin_df)

python

modin_df.sample()

python

from transformers import BertTokenizer, TFBertForSequenceClassification

python

# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

python

model.summary()

python

# changing positive and negative into numeric values

def cat2num(value):
    if value=='positive': 
        return 1
    else: 
        return 0
    
modin_df['sentiment']  =  modin_df['sentiment'].apply(cat2num)
train = modin_df[:45000]
test = modin_df[45000:]

python

# But first see BERT tokenizer exmaples and other required stuff!

example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'
tokens=tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

python

type(train)

python

def convert_data_to_examples(train, test, review, sentiment): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[review], 
                                                          label = x[sentiment]), axis = 1,)
  
    return train_InputExamples, validation_InputExamples

train_InputExamples, validation_InputExamples = convert_data_to_examples(train,  test, 'review',  'sentiment')

python

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,    # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label) )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'

python

train_InputExamples

python

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

python

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

python

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

python

model.fit(train_data, epochs=2, validation_data=validation_data)

python

pred_sentences = ['worst movie of my life, will never watch movies from this series', 
                  'Wow, blew my mind, what a movie by Marvel, animation and story is amazing']

python

tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')   # we are tokenizing before sending into our trained model
tf_outputs = model(tf_batch)                                  
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)       # axis=-1, this means that the index that will be returned by argmax will be taken from the *last* axis.
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
    print(pred_sentences[i], ": ", labels[label[i]])