Llama3 Cookbook

Meta developed and released the Meta Llama 3 family of large language models (LLMs), a collection of pretrained and instruction-tuned generative text models in 8B and 70B sizes. The Llama 3 instruction-tuned models are optimized for dialogue use cases and outperform many of the available open-source chat models on common industry benchmarks.

In this notebook, we will demonstrate how to use Llama3 with LlamaIndex. Here, we use Llama-3-8B-Instruct for the demonstration.

Installation

python
!pip install llama-index
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-huggingface
!pip install llama-index-embeddings-huggingface-api

To use Llama3 from the official repo, you'll need to accept the model license on Hugging Face (the repo is gated) and authenticate with your Hugging Face access token.

python
hf_token = "hf_"  # paste your Hugging Face access token here
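
Optionally, instead of passing the token to every call, you can authenticate once for the session with huggingface_hub's login helper (a small sketch; huggingface_hub is installed as a dependency of transformers):

python
from huggingface_hub import login

# Registers the token for all subsequent Hugging Face Hub requests
login(token=hf_token)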

Setup Tokenizer and Stopping IDs

python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token,
)

# Llama 3 instruct models end each turn with <|eot_id|> rather than only the
# regular EOS token, so we stop generation on either of the two.
stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

Setup LLM using HuggingFaceLLM

python
# generate_kwargs parameters are taken from https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct

import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Optional quantization to 4bit
# import torch
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )

llm = HuggingFaceLLM(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    model_kwargs={
        "token": hf_token,
        "torch_dtype": torch.bfloat16,  # comment this line and uncomment below to use 4bit
        # "quantization_config": quantization_config
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    },
    tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_kwargs={"token": hf_token},
    stopping_ids=stopping_ids,
)
python
## Alternatively, you can deploy the model on a HF Inference Endpoint and use it

# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# llm = HuggingFaceInferenceAPI(
#     model_name="<HF Inference Endpoint>",
#     token='<HF Token>'
# )

Call complete with a prompt

python
response = llm.complete("Who is Paul Graham?")

print(response)
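
For long generations you may prefer to print tokens as they are produced. LlamaIndex LLMs also expose stream_complete for this; a minimal sketch:

python
response = llm.stream_complete("Who is Paul Graham?")

# Each chunk carries the newly generated text in .delta
for token in response:
    print(token.delta, end="")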

Call chat with a list of messages

python
from llama_index.core.llms import ChatMessage

messages = [
    ChatMessage(role="system", content="You are CEO of MetaAI"),
    ChatMessage(role="user", content="Introduce Llama3 to the world."),
]
response = llm.chat(messages)
python
print(response)
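
chat has a streaming counterpart as well; stream_chat yields incremental deltas in the same way:

python
# Stream the assistant's reply token by token
for chunk in llm.stream_chat(messages):
    print(chunk.delta, end="")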

Let's build a RAG pipeline with Llama3

Download Data

python
!wget "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt" -O "paul_graham_essay.txt"

Load Data

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_files=["paul_graham_essay.txt"]
).load_data()
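
Before indexing, it is worth a quick sanity check that the file loaded as expected; each Document carries the text plus some file metadata:

python
print(f"Loaded {len(documents)} document(s)")
print(documents[0].metadata)
print(documents[0].text[:200])  # first 200 characters of the essay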

Setup Embedding Model

python
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
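
As a quick check, you can embed a sample string directly; bge-small-en-v1.5 produces a 384-dimensional vector (verify the dimension if you swap in a different model):

python
sample_embedding = embed_model.get_text_embedding("Hello, world!")
print(len(sample_embedding))  # expected: 384 for bge-small-en-v1.5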

Set Default LLM and Embedding Model

python
from llama_index.core import Settings

# bge embedding model
Settings.embed_model = embed_model

# Llama-3-8B-Instruct model
Settings.llm = llm

Create Index

python
index = VectorStoreIndex.from_documents(
    documents,
)
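
Building the index re-embeds every chunk, so outside of a demo you will likely want to persist it to disk and reload it instead of rebuilding; a sketch using LlamaIndex's default storage (the ./storage path is arbitrary):

python
from llama_index.core import StorageContext, load_index_from_storage

# Save the index to disk
index.storage_context.persist(persist_dir="./storage")

# Later: reload it without re-embedding the documents
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)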

Create QueryEngine

python
query_engine = index.as_query_engine(similarity_top_k=3)

Querying

python
response = query_engine.query("What did Paul Graham do growing up?")
python
print(response)
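
The response also carries the retrieved chunks, which is useful for checking that retrieval pulled sensible context for the answer:

python
# Inspect the top-k retrieved chunks and their similarity scores
for node_with_score in response.source_nodes:
    print(node_with_score.score, node_with_score.node.get_content()[:100])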

Agents and Tools

python
import json
from typing import Sequence, List

from llama_index.core.llms import ChatMessage
from llama_index.core.tools import BaseTool, FunctionTool
from llama_index.core.agent import ReActAgent

import nest_asyncio

nest_asyncio.apply()

Define Tools

python
def multiply(a: int, b: int) -> int:
    """Multiply two integers and return the result."""
    return a * b


def add(a: int, b: int) -> int:
    """Add two integers and return the result."""
    return a + b


def subtract(a: int, b: int) -> int:
    """Subtract two integers and return the result."""
    return a - b


def divide(a: int, b: int) -> float:
    """Divide two integers and return the result."""
    return a / b


multiply_tool = FunctionTool.from_defaults(fn=multiply)
add_tool = FunctionTool.from_defaults(fn=add)
subtract_tool = FunctionTool.from_defaults(fn=subtract)
divide_tool = FunctionTool.from_defaults(fn=divide)
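
FunctionTool infers each tool's name, description, and argument schema from the function signature and docstring, which is why clear docstrings matter here. You can also call a tool directly to check it behaves as expected:

python
# Invoke the tool outside the agent; returns a ToolOutput
print(multiply_tool.call(a=7, b=6))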

ReAct Agent

python
agent = ReActAgent.from_tools(
    [multiply_tool, add_tool, subtract_tool, divide_tool],
    llm=llm,
    verbose=True,
)

Querying

python
response = agent.chat("What is (121 + 2) * 5?")
print(str(response))
python
response = agent.chat("What is (100/5)*2-5+10 ?")
print(str(response))
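
The agent keeps chat history across calls, so the second question is answered in the context of the first. If you want each question handled independently, clear the memory in between:

python
# Reset the agent's conversation state
agent.reset()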

ReAct Agent With RAG QueryEngine Tools

python
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

from llama_index.core.tools import QueryEngineTool, ToolMetadata

Download Data

python
!mkdir -p 'data/10k/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf' -O 'data/10k/lyft_2021.pdf'

Load Data

python
lyft_docs = SimpleDirectoryReader(
    input_files=["./data/10k/lyft_2021.pdf"]
).load_data()
uber_docs = SimpleDirectoryReader(
    input_files=["./data/10k/uber_2021.pdf"]
).load_data()

Create Indices

python
lyft_index = VectorStoreIndex.from_documents(lyft_docs)
uber_index = VectorStoreIndex.from_documents(uber_docs)

Create QueryEngines

python
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3)
uber_engine = uber_index.as_query_engine(similarity_top_k=3)
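
Each engine can be queried on its own before being wrapped as a tool, which makes it easier to debug retrieval issues in isolation:

python
# Try a query against one index directly
print(lyft_engine.query("What was Lyft's revenue in 2021?"))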

Define QueryEngine Tools

python
query_engine_tools = [
    QueryEngineTool(
        query_engine=lyft_engine,
        metadata=ToolMetadata(
            name="lyft_10k",
            description=(
                "Provides information about Lyft financials for year 2021. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    QueryEngineTool(
        query_engine=uber_engine,
        metadata=ToolMetadata(
            name="uber_10k",
            description=(
                "Provides information about Uber financials for year 2021. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
]

Create ReAct Agent using RAG QueryEngine Tools

python
agent = ReActAgent.from_tools(
    query_engine_tools,
    llm=llm,
    verbose=True,
)

Querying

python
response = agent.chat("What was Lyft's revenue in 2021?")
print(str(response))
python
response = agent.chat("What was Uber's revenue in 2021?")
print(str(response))
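
Because the agent has access to both tools, it can also answer questions that span the two filings, calling each tool and combining the results:

python
response = agent.chat(
    "Compare the revenue of Uber and Lyft in 2021."
)
print(str(response))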