examples/evaluation/lora-lm-eval.ipynb
In this notebook, we will learn how to evaluate a LoRA-finetuned model on the HellaSwag task with the LM Evaluation Harness (lm-eval) toolkit: we first score the base bert-base-cased model, then train a LoRA adapter on the IMDB sentiment task, and finally re-evaluate the adapted model.
# Install LM-Eval
!pip install -q datasets evaluate lm_eval peft
import torch
import lm_eval
# Use the detected accelerator if this torch version exposes the accelerator API, otherwise default to CUDA
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
# Evaluate the base model on HellaSwag
output = lm_eval.simple_evaluate(
    model="hf",
    model_args={
        "pretrained": "bert-base-cased",
        "dtype": "bfloat16",
    },
    tasks=["hellaswag"],
    device=device,
    batch_size=128,
    log_samples=False,
)
output["results"]
# Import necessary libraries
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, TaskType, get_peft_model
# Configure LoRA for Sequence Classification
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,        # Set task type to sequence classification
    target_modules=["query", "key"],   # Specify target modules for LoRA tuning
)
# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)
# Wrap the model with LoRA configuration
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
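To double-check which layers actually received adapters, you can list the injected LoRA parameters; this is an optional inspection sketch that assumes PEFT's standard lora_A/lora_B parameter naming.
# Optional: list a few of the injected LoRA parameter names (PEFT names them with "lora_")
lora_param_names = [name for name, _ in model.named_parameters() if "lora_" in name]
print(len(lora_param_names), "LoRA parameter tensors, e.g.:", lora_param_names[:4])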
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# Load the IMDB dataset
dataset = load_dataset("imdb")
def tokenize_function(row):
    return tokenizer(row["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
# Define a function to compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)
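As a quick sanity check, you can call compute_metrics directly on a tiny made-up batch of logits and labels; the values below are purely illustrative.
# Optional sanity check with fabricated example values: argmax gives [1, 0] against labels [1, 1]
dummy_logits = np.array([[0.2, 0.8], [0.9, 0.1]])
dummy_labels = np.array([1, 1])
print(compute_metrics((dummy_logits, dummy_labels)))  # expected: {'accuracy': 0.5}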
# Configure training arguments
training_args = TrainingArguments(
    "bert-lora-imdb",
    eval_strategy="epoch",
    per_device_train_batch_size=32,   # Decrease this if you run out of GPU memory
    per_device_eval_batch_size=64,
    save_strategy="epoch",
    learning_rate=2e-3,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    do_eval=True,
    do_predict=True,
    metric_for_best_model="accuracy",
    report_to="none",
)
# Initialize the Trainer for the model training loop
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
# Start training
trainer.train()
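After training, you can run a final evaluation pass on the IMDB test split and, optionally, save just the LoRA adapter weights; the adapter path below is an assumed example, while the next cell simply reuses one of the Trainer checkpoints.
# Optional: final IMDB evaluation and standalone adapter export (output path is an assumed example)
print(trainer.evaluate())
model.save_pretrained("bert-lora-imdb/adapter")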
# Use the path of your own checkpoint here
output = lm_eval.simple_evaluate(
    model="hf",
    model_args={
        "pretrained": "bert-base-cased",
        "peft": "./bert-lora-imdb/checkpoint-3910",
        "dtype": "bfloat16",
    },
    tasks=["hellaswag"],
    device=device,
    batch_size=128,
    log_samples=False,
)
output["results"]