
LoRA lm-eval

examples/evaluation/lora-lm-eval.ipynb


PEFT model evaluation using lm-eval-harness

In this notebook, we are going to learn how to evaluate a LoRA fine-tuned model on the HellaSwag task using the lm-eval-harness toolkit.

python
# Install lm-eval and its companion libraries
# (torch, transformers and peft are assumed to already be available)
!pip install -q datasets evaluate lm_eval

First, we will check the accuracy score on the HellaSwag task for the base BERT model without fine-tuning.

python
import torch
import lm_eval


device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
output = lm_eval.simple_evaluate(model = 'hf',
                        model_args = {
                            'pretrained' : 'bert-base-cased',
                            'dtype' : 'bfloat16'},
                        tasks = 'hellaswag',
                        device = device,
                        batch_size = 128,
                        log_samples = False)
output["results"]

Now let's try to fine-tune BERT on the IMDB dataset (this is just for demonstration; fine-tuning on IMDB may not increase the score on the HellaSwag task).

python
# Import necessary libraries
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

from peft import LoraConfig, TaskType, get_peft_model
python
# Configure LoRA for Sequence Classification
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,        # Set task type to sequence classification
    target_modules=["query", "key"]    # Specify target modules for LoRA tuning
)

# Initialize the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels = 2
)

# Wrap the model with LoRA configuration
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
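
To sanity-check what `get_peft_model` actually left trainable (the LoRA A/B matrices and, for `TaskType.SEQ_CLS`, the classification head), you can list the parameters that still require gradients. This is purely an optional inspection step.

python
# Optional inspection: list every parameter that remains trainable after
# wrapping the model with LoRA (the LoRA matrices plus the classifier head).
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, tuple(param.shape))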
python
# load the dataset
dataset = load_dataset("imdb")

def tokenize_function(row):
    return tokenizer(row["text"], padding="max_length", truncation = True)

tokenized_datasets = dataset.map(tokenize_function, batched = True)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]
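
The IMDB train and test splits each contain 25,000 examples, so a full run can take a while. If you only want to verify that the pipeline works end to end, one option is to subsample the splits and pass these smaller datasets to the Trainer instead; the sizes below are arbitrary.

python
# Optional: subsample for a quick smoke test (sizes are arbitrary).
# Pass these to the Trainer below instead of the full splits if you use them.
small_train_dataset = train_dataset.shuffle(seed=42).select(range(2000))
small_eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000))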
python
# Define a function to compute evaluation metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric = evaluate.load("accuracy")
    return metric.compute(predictions = predictions, references = labels)
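
A quick way to convince yourself the metric function behaves as expected is to call it on a couple of hand-made logits; the values below are made up purely for illustration.

python
# Sanity check compute_metrics with toy logits: both predictions match the
# labels, so the reported accuracy should be 1.0.
dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2]])
dummy_labels = np.array([1, 0])
print(compute_metrics((dummy_logits, dummy_labels)))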
python
# Configure training arguments
training_args = TrainingArguments("bert-lora-imdb",
    eval_strategy="epoch",
    per_device_train_batch_size=32, # decrease this for OOM error
    per_device_eval_batch_size=64,
    save_strategy="epoch",
    learning_rate=2e-3,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    do_eval=True,
    do_predict=True,
    metric_for_best_model="accuracy",
    report_to="none")

# Initialize the Trainer for the model training loop
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()
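
The Trainer already writes epoch checkpoints (including the adapter weights) under `bert-lora-imdb/`. If you prefer to hand lm-eval a dedicated adapter directory instead of a checkpoint folder, you can also save the adapter explicitly; the directory name below is just an example.

python
# Optionally save the LoRA adapter to its own directory (name is arbitrary);
# this directory can also be passed to lm-eval via the `peft` model_arg.
model.save_pretrained("bert-lora-imdb-adapter")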

Now take the fine-tuned LoRA checkpoint and check the accuracy score on the HellaSwag task.

python
# use the path of your checkpoint here
output = lm_eval.simple_evaluate(model = 'hf',
                        model_args = {
                          'pretrained' : 'bert-base-cased',
                          'peft' : './bert-lora-imdb/checkpoint-3910',
                          'dtype' : 'bfloat16'},
                        tasks = 'hellaswag',
                        device = device,
                        batch_size = 128,
                        log_samples = False)

output["results"]