docs/examples/vector_stores/SimpleIndexDemoLlama2.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/vector_stores/SimpleIndexDemoLlama2.ipynb" target="_parent"></a>
This notebook walks through the proper setup to use llama-2 with LlamaIndex. Specifically, we look at using a vector store index.
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
%pip install llama-index-llms-replicate
!pip install llama-index
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["REPLICATE_API_TOKEN"] = "YOUR_REPLICATE_TOKEN"
# Optional logging
# import logging
# import sys
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from IPython.display import Markdown, display
from llama_index.llms.replicate import Replicate
from llama_index.core.llms.llama_utils import (
messages_to_prompt,
completion_to_prompt,
)
# The replicate endpoint
LLAMA_13B_V2_CHAT = "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
# inject custom system prompt into llama-2
def custom_completion_to_prompt(completion: str) -> str:
return completion_to_prompt(
completion,
system_prompt=(
"You are a Q&A assistant. Your goal is to answer questions as "
"accurately as possible is the instructions and context provided."
),
)
llm = Replicate(
model=LLAMA_13B_V2_CHAT,
temperature=0.01,
# override max tokens since it's interpreted
# as context window instead of max tokens
context_window=4096,
# override completion representation for llama 2
completion_to_prompt=custom_completion_to_prompt,
# if using llama 2 for data agents, also override the message representation
messages_to_prompt=messages_to_prompt,
)
from llama_index.core import Settings
Settings.llm = llm
Download Data
# load documents
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
index = VectorStoreIndex.from_documents(documents)
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
display(Markdown(f"<b>{response}</b>"))
query_engine = index.as_query_engine(streaming=True)
response = query_engine.query("What happened at interleaf?")
for token in response.response_gen:
print(token, end="")