examples/randlora_finetuning/qrandlora_finetuning.ipynb
# Install the libraries
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
# Required when using models/datasets that are gated on the Hugging Face Hub, and when pushing models to the Hub
from huggingface_hub import notebook_login
notebook_login()
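If you run this outside a notebook, a non-interactive login works as well; the snippet below is only a sketch that assumes your token is exported as the HF_TOKEN environment variable.
# Alternative to notebook_login() for scripts: log in with a token read from
# an environment variable (HF_TOKEN here is only an example variable name).
import os
from huggingface_hub import login

if os.environ.get("HF_TOKEN"):
    login(token=os.environ["HF_TOKEN"])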
# Set up the config for the 4-bit quantization used by QRandLora
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_id = "meta-llama/Meta-Llama-3-8B"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
print(model)
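As an optional sanity check, you can report how much memory the 4-bit model occupies; get_memory_footprint is a standard transformers helper and the exact number will depend on your setup.
# Optional: print the on-device memory footprint of the quantized model in GB.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")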
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
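Optionally, you can check what prepare_model_for_kbit_training changed: it typically upcasts the remaining non-quantized parameters (e.g. layer norms) to float32 for training stability. A minimal sketch:
# Optional: count parameters per dtype to confirm the non-quantized weights were upcast.
from collections import Counter

print(Counter(str(p.dtype) for p in model.parameters()))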
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
from peft import RandLoraConfig, get_peft_model
config = RandLoraConfig(
    r=32,
    randlora_alpha=640,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # target modules specific to the Llama architecture
    randlora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
print(model)
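PEFT models also expose an equivalent built-in summary, and listing the parameters that remained trainable is a quick way to confirm the target_modules were matched; this is an optional check, not part of the recipe.
# Optional: PEFT's built-in summary plus a peek at the injected trainable parameters.
model.print_trainable_parameters()
trainable_names = [n for n, p in model.named_parameters() if p.requires_grad]
print(len(trainable_names), trainable_names[:3])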
# Load the dataset from HF
from datasets import load_dataset
data = load_dataset("timdettmers/openassistant-guanaco")
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)
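Optionally, inspect one tokenized example to confirm that the map added input_ids next to the raw text.
# Optional: look at the fields of the first example and decode the start of its tokens.
print(data["train"][0].keys())
print(tokenizer.decode(data["train"][0]["input_ids"][:40]))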
For the sake of the demo, we only run training for 10 steps, just to showcase how to use this integration with existing tools in the HF ecosystem.
import transformers
tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="path/to/your/HF/repo",  # change it to your desired repo!
        optim="paged_adamw_8bit",
        label_names=["labels"],
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
model.config.use_cache = True
model.eval();
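After training, only the small RandLora adapter needs to be stored; the directory name below is just an example.
# Optional: save the RandLora adapter and tokenizer locally (the path is only an example).
adapter_dir = "qrandlora-llama3-8b-guanaco"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)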
from transformers import GenerationConfig
max_new_tokens = 120
top_p = 0.9
temperature = 0.7
user_question = "What is the purpose of quantization in LLMs?"
prompt = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions. "
    "### Human: {user_question}"
    "### Assistant: "
)
def generate(model, user_question, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature):
    device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
    inputs = tokenizer(prompt.format(user_question=user_question), return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        generation_config=GenerationConfig(
            do_sample=True,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            temperature=temperature,
        ),
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # print(text)
    return text
generate(model, user_question)
# trainer.push_to_hub()
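If you saved or pushed the adapter, it can later be reloaded on top of a freshly quantized base model; AutoPeftModelForCausalLM is PEFT's convenience loader, and the path below assumes the example directory used in the save step (note this loads a second copy of the base model).
# Optional: reload the trained adapter for inference on a newly quantized base model.
from peft import AutoPeftModelForCausalLM

reloaded = AutoPeftModelForCausalLM.from_pretrained(
    "qrandlora-llama3-8b-guanaco",  # local dir or Hub repo id from the save/push step
    quantization_config=bnb_config,
    device_map={"": 0},
)
reloaded.eval();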