docs/examples/customization/llms/SimpleIndexDemo-Huggingface_stablelm.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/customization/llms/SimpleIndexDemo-Huggingface_stablelm.ipynb" target="_parent"></a>
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
%pip install llama-index-llms-huggingface
!pip install llama-index
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
# load documents
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
# setup prompts - specific to StableLM
from llama_index.core import PromptTemplate
system_prompt = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
"""
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")
import torch
llm = HuggingFaceLLM(
context_window=4096,
max_new_tokens=256,
generate_kwargs={"temperature": 0.7, "do_sample": False},
system_prompt=system_prompt,
query_wrapper_prompt=query_wrapper_prompt,
tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
model_name="StabilityAI/stablelm-tuned-alpha-3b",
device_map="auto",
stopping_ids=[50278, 50279, 50277, 1, 0],
tokenizer_kwargs={"max_length": 4096},
# uncomment this if using CUDA to reduce memory usage
# model_kwargs={"torch_dtype": torch.float16}
)
Settings.llm = llm
Settings.chunk_size = 1024
index = VectorStoreIndex.from_documents(documents)
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)
query_engine = index.as_query_engine(streaming=True)
# set Logging to DEBUG for more detailed outputs
response_stream = query_engine.query("What did the author do growing up?")
# can be slower to start streaming since llama-index often involves many LLM calls
response_stream.print_response_stream()
# can also get a normal response object
response = response_stream.get_response()
print(response)
# can also iterate over the generator yourself
generated_text = ""
for text in response.response_gen:
generated_text += text