examples/randlora_finetuning/qrandlora_finetuning.ipynb
# Install the libraries
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
# Required when using models/datasets that are gated on the Hugging Face Hub, and when pushing models to the Hub
from huggingface_hub import notebook_login
notebook_login()
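If you run this outside a notebook, a non-interactive login works as well; the snippet below is only a sketch that assumes your token is exported as the HF_TOKEN environment variable.
# Alternative to notebook_login() for scripts: log in with a token read from
# an environment variable (HF_TOKEN here is only an example variable name).
import os
from huggingface_hub import login

if os.environ.get("HF_TOKEN"):
    login(token=os.environ["HF_TOKEN"])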
# Set up the config for the 4-bit quantization used by QRandLora
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "meta-llama/Meta-Llama-3-8B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
print(model)
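As an optional sanity check, you can report how much memory the 4-bit model occupies; get_memory_footprint is a standard transformers helper and the exact number will depend on your setup.
# Optional: print the on-device memory footprint of the quantized model in GB.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")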
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
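Optionally, you can check what prepare_model_for_kbit_training changed: it typically upcasts the remaining non-quantized parameters (e.g. layer norms) to float32 for training stability. A minimal sketch:
# Optional: count parameters per dtype to confirm the non-quantized weights were upcast.
from collections import Counter

print(Counter(str(p.dtype) for p in model.parameters()))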
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
from peft import RandLoraConfig, get_peft_model
config = RandLoraConfig(
    r=32,
    randlora_alpha=640,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # target modules specific to the Llama architecture
    randlora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
print(model)
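PEFT models also expose an equivalent built-in summary, and listing the parameters that remained trainable is a quick way to confirm the target_modules were matched; this is an optional check, not part of the recipe.
# Optional: PEFT's built-in summary plus a peek at the injected trainable parameters.
model.print_trainable_parameters()
trainable_names = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(trainable_names), trainable_names[:3])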
# Load the dataset from HF
from datasets import load_dataset
data = load_dataset("timdettmers/openassistant-guanaco")
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)
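Optionally, inspect one tokenized example to confirm that the map added input_ids next to the raw text.
# Optional: look at the fields of the first example and decode the start of its tokens.
print(data["train"][0].keys())
print(tokenizer.decode(data["train"][0]["input_ids"][:40]))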
For the sake of the demo, we only run training for 10 steps, just to showcase how to use this integration with existing tools in the HF ecosystem.
import transformers
tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="path/to/your/HF/repo",  # change it to your desired repo!
        optim="paged_adamw_8bit",
        label_names=["labels"],
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
model.config.use_cache = True
model.eval();
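After training, only the small RandLora adapter needs to be stored; the directory name below is just an example.
# Optional: save the RandLora adapter and tokenizer locally (the path is only an example).
adapter_dir = "qrandlora-llama3-8b-guanaco"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)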
from transformers import GenerationConfig
max_new_tokens = 120
top_p = 0.9
temperature = 0.7
user_question = "What is the purpose of quantization in LLMs?"
prompt = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions. "
    "### Human: {user_question}"
    "### Assistant: "
)
def generate(model, user_question, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature):
    device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
    inputs = tokenizer(prompt.format(user_question=user_question), return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        generation_config=GenerationConfig(
            do_sample=True,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            temperature=temperature,
        ),
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print(text)
    return text
generate(model, user_question)
# trainer.push_to_hub()
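If you saved or pushed the adapter, it can later be reloaded on top of a freshly quantized base model; AutoPeftModelForCausalLM is PEFT's convenience loader, and the path below assumes the example directory used in the save step (note this loads a second copy of the base model).
# Optional: reload the trained adapter for inference on a newly quantized base model.
from peft import AutoPeftModelForCausalLM

reloaded = AutoPeftModelForCausalLM.from_pretrained(
    "qrandlora-llama3-8b-guanaco",  # local dir or Hub repo id from the save/push step
    quantization_config=bnb_config,
    device_map={"": 0},
)
reloaded.eval();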