examples/jupyter/integrations/huggingface.ipynb
import modin.pandas as pd
import numpy as np # linear algebra
import tensorflow as tf
import sklearn
from tqdm import tqdm
import urllib.request
# Download the IMDB reviews CSV and store it in the working directory as "imdb.csv".
url_path = "https://modin-datasets.intel.com/testing/IMDB_Dataset.csv"
urllib.request.urlretrieve(url_path, "imdb.csv")
%%time
# Load the downloaded CSV through Modin's pandas-compatible API (`pd` is modin.pandas).
modin_df = pd.read_csv("imdb.csv")
# The bare expressions below are notebook-style inspections of the loaded frame.
modin_df.head()
type(modin_df)
modin_df.sample()
from transformers import BertTokenizer, TFBertForSequenceClassification
# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import InputExample, InputFeatures
# Load the pretrained "bert-base-uncased" sequence-classification model and the
# tokenizer from the same checkpoint name.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Show a summary of the loaded model.
model.summary()
# changing positive and negative into numeric values
def cat2num(value):
    """Encode a sentiment label as an integer.

    Returns 1 when ``value`` equals ``'positive'`` and 0 for anything else.
    """
    return 1 if value == 'positive' else 0
# Replace the textual sentiment labels with their 1/0 encoding from cat2num.
modin_df['sentiment'] = modin_df['sentiment'].apply(cat2num)
# Positional split: the first 45000 rows are used for training, the remainder is held out.
train = modin_df[:45000]
test = modin_df[45000:]
# But first, see BERT tokenizer examples and other required stuff!
example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'
# Split the sentence into tokens, then map each token to its id.
tokens=tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)
# Notebook-style inspection of the training split's type.
type(train)
def convert_data_to_examples(train, test, review, sentiment):
    """Wrap every row of ``train`` and ``test`` in an ``InputExample``.

    Args:
        train: Frame whose rows become the training examples.
        test: Frame whose rows become the validation examples.
        review: Name of the column holding the text.
        sentiment: Name of the column holding the label.

    Returns:
        A ``(train_examples, validation_examples)`` pair, each the result of a
        row-wise ``apply`` over the corresponding frame.
    """

    def row_to_example(row):
        # guid is a globally unique ID for bookkeeping; it is unused here.
        return InputExample(guid=None, text_a=row[review], label=row[sentiment])

    train_examples = train.apply(row_to_example, axis=1)
    validation_examples = test.apply(row_to_example, axis=1)
    return train_examples, validation_examples
# Wrap the train/test rows as InputExample objects, taking the text from the
# 'review' column and the label from the 'sentiment' column.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, 'review', 'sentiment')
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with ``text_a`` and ``label`` attributes.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length to which every encoded sequence is truncated and
            padded.

    Returns:
        A ``tf.data.Dataset`` built from a generator that yields
        ``(inputs, label)`` pairs, where ``inputs`` is a dict with the keys
        ``"input_ids"``, ``"attention_mask"`` and ``"token_type_ids"``.
    """
    features = []  # InputFeatures collected here, converted to a dataset below

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,  # add 'CLS' and 'SEP'
            max_length=max_length,  # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; `padding="max_length"`
            # is the equivalent new-style argument and matches the
            # `truncation=True` argument already used here.
            padding="max_length",
            truncation=True,
        )
        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (inputs, label) pair per collected feature.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
# Names of the text and label columns.
# NOTE(review): these two constants are not referenced anywhere in the visible
# code, which passes the literals directly — confirm before relying on them.
DATA_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'
# Notebook-style inspection of the converted training examples.
train_InputExamples
# Tokenize the training examples, then shuffle (buffer of 100), batch by 32 and repeat twice.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)
# The validation examples are tokenized the same way but only batched.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)
# The loss is configured with from_logits=True, i.e. it expects unnormalized scores.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
# NOTE(review): repeat(2) above combined with epochs=2 presumably results in
# four passes over the training data in total — confirm this is intended.
model.fit(train_data, epochs=2, validation_data=validation_data)
pred_sentences = [
    'worst movie of my life, will never watch movies from this series',
    'Wow, blew my mind, what a movie by Marvel, animation and story is amazing',
]
# Tokenize the sentences before sending them into the trained model.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Softmax over the last axis of the first model output.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative', 'Positive']
# Index of the highest score for each sentence, converted to a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_index in zip(pred_sentences, label):
    print(sentence, ": ", labels[class_index])