examples/multi_adapter_examples/PEFT_Multi_LoRA_Inference.ipynb
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q git+https://github.com/huggingface/accelerate.git@main
!pip install -q huggingface_hub bitsandbytes sentencepiece
import os
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # force using CUDA device 0
os.environ["ZE_AFFINITY_MASK"] = "0" # force using Intel XPU device 0
from huggingface_hub import notebook_login
notebook_login()
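If you are running this outside a notebook, `huggingface_hub.login` does the same job non-interactively; a minimal sketch, assuming your token is exported in an `HF_TOKEN` environment variable (that variable name is an assumption here, not something this notebook sets):

```python
# Sketch: script-friendly alternative to the notebook_login() widget.
import os
from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])  # assumes HF_TOKEN holds your Hub token
```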
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, BitsAndBytesConfig
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    token=True,  # use_auth_token is deprecated; token=True reads your cached Hub credentials
)
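If the 8-bit checkpoint still doesn't fit in memory, 4-bit NF4 quantization is a drop-in alternative via the same `BitsAndBytesConfig`; a minimal sketch, where the NF4 settings are the common QLoRA-style defaults rather than values prescribed by this notebook:

```python
# Sketch: 4-bit NF4 loading as a lower-memory alternative (assumed settings).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    token=True,
)
```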
%%time
model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b", adapter_name="eng_alpaca")
%%time
model.load_adapter("22h/cabrita-lora-v0-1", adapter_name="portuguese_alpaca")
model
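At this point both adapters are registered on the `PeftModel`; you can confirm which ones are attached and which is active using its bookkeeping attributes, as in this short sketch:

```python
# Sketch: inspect the adapters registered on the PeftModel.
print(list(model.peft_config.keys()))  # ['eng_alpaca', 'portuguese_alpaca']
print(model.active_adapter)            # adapter currently used at forward time
```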
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
# Note: device_map="auto" already placed the quantized model; calling model.to(device)
# on a bitsandbytes 8-bit model raises an error, so only the input tensors are moved below.
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        no_repeat_ngram_size=3,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()
%%time
model.set_adapter("eng_alpaca")
instruction = "Tell me about alpacas."
print(evaluate(instruction))
%%time
model.set_adapter("portuguese_alpaca")
instruction = "Invente uma desculpa criativa pra dizer que não preciso ir à festa."
print(evaluate(instruction))
with model.disable_adapter():
    instruction = "Invente uma desculpa criativa pra dizer que não preciso ir à festa."
    print(evaluate(instruction))
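Beyond switching adapters one at a time, PEFT can also blend multiple LoRA adapters into a new one with `add_weighted_adapter`; a minimal sketch, where the name `mixed_alpaca` and the 50/50 weights are illustrative choices, not values from this notebook (`combination_type="svd"` is the default and tolerates adapters of different rank):

```python
# Sketch: merge the two LoRA adapters into a new, combined adapter and activate it.
# "mixed_alpaca" and the equal weights are illustrative assumptions.
model.add_weighted_adapter(
    adapters=["eng_alpaca", "portuguese_alpaca"],
    weights=[0.5, 0.5],
    adapter_name="mixed_alpaca",
    combination_type="svd",
)
model.set_adapter("mixed_alpaca")
```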