docs/examples/retrievers/multi_doc_together_hybrid.ipynb
This notebook shows how to use long-context together.ai embedding models for advanced RAG. We index each document by running the embedding model over the entire document text, and we also embed each chunk. We then define a custom retriever that computes both node-level and document-level similarity.
Visit https://together.ai and sign up to get an API key.
We load in the LlamaIndex documentation. For the sake of speed we load in just a small subset of pages, but if you want to stress test your model you can load in all of it.
%pip install llama-index-embeddings-together
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-readers-file
domain = "docs.llamaindex.ai"
docs_url = "https://docs.llamaindex.ai/en/latest/"
!wget -e robots=off --recursive --no-clobber --page-requisites --html-extension --convert-links --restrict-file-names=windows --domains {domain} --no-parent {docs_url}
from llama_index.readers.file import UnstructuredReader
from pathlib import Path
from llama_index.llms.openai import OpenAI
from llama_index.core import Document
reader = UnstructuredReader()
# all_files_gen = Path("./docs.llamaindex.ai/").rglob("*")
# all_files = [f.resolve() for f in all_files_gen]
# all_html_files = [f for f in all_files if f.suffix.lower() == ".html"]
# curate a subset
all_html_files = [
"docs.llamaindex.ai/en/latest/index.html",
"docs.llamaindex.ai/en/latest/contributing/contributing.html",
"docs.llamaindex.ai/en/latest/understanding/understanding.html",
"docs.llamaindex.ai/en/latest/understanding/using_llms/using_llms.html",
"docs.llamaindex.ai/en/latest/understanding/using_llms/privacy.html",
"docs.llamaindex.ai/en/latest/understanding/loading/llamahub.html",
"docs.llamaindex.ai/en/latest/optimizing/production_rag.html",
"docs.llamaindex.ai/en/latest/module_guides/models/llms.html",
]
# TODO: set to higher value if you want more docs
doc_limit = 10
docs = []
for idx, f in enumerate(all_html_files):
    if idx > doc_limit:
        break
    print(f"Idx {idx}/{len(all_html_files)}")
    loaded_docs = reader.load_data(file=f, split_documents=True)
    # Hardcoded index. Everything before this is ToC for all pages
    # Adjust this start_idx to suit your needs
    start_idx = 64
    loaded_doc = Document(
        id_=str(f),
        text="\n\n".join([d.get_content() for d in loaded_docs[start_idx:]]),
        metadata={"path": str(f)},
    )
    print(str(f))
    docs.append(loaded_doc)
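As an optional sanity check (not part of the original flow), we can confirm how many documents were loaded and peek at the start of one of them:
print(f"Loaded {len(docs)} documents")
print(docs[0].metadata["path"])
print(docs[0].get_content()[:200])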
Define a custom retriever that does the following:
- First retrieve chunks by embedding similarity with the query
- For each chunk, look up the source document in the docstore and compute the query's similarity with the full document embedding
- Combine the chunk and document similarities into a final weighted score
This is essentially vector retrieval with a reranking step that reweights the node similarities.
# You can set the API key in the embeddings or env
# import os
# os.environ["TOEGETHER_API_KEY"] = "your-api-key"
from llama_index.embeddings.together import TogetherEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
api_key = "<api_key>"
embed_model = TogetherEmbedding(
    model_name="togethercomputer/m2-bert-80M-32k-retrieval", api_key=api_key
)
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
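Before embedding every document, it can be worth an optional smoke test that the Together API key works and to see the embedding dimensionality; get_text_embedding is the same call we use below:
test_embedding = embed_model.get_text_embedding("hello world")
print(f"Embedding dimension: {len(test_embedding)}")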
Create a docstore for the original documents. Embed each document and add it to the docstore.
We will refer to this later in our hybrid retrieval algorithm!
from llama_index.core.storage.docstore import SimpleDocumentStore
for doc in docs:
    embedding = embed_model.get_text_embedding(doc.get_content())
    doc.embedding = embedding
docstore = SimpleDocumentStore()
docstore.add_documents(docs)
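If you want to verify that the document embeddings made it into the docstore (the hybrid retriever below depends on them), you can fetch a document back by ID using the same get_document call the retriever uses:
stored_doc = docstore.get_document(docs[0].id_)
print(stored_doc.metadata["path"], len(stored_doc.embedding))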
Let's build the vector index of chunks. Each chunk will also have a reference to its source document through its index_id (which can then be used to look up the source document in the docstore).
from llama_index.core.schema import IndexNode
from llama_index.core import (
load_index_from_storage,
StorageContext,
VectorStoreIndex,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SummaryIndex
from llama_index.core.retrievers import RecursiveRetriever
import os
from tqdm.notebook import tqdm
import pickle
def build_index(docs, out_path: str = "storage/chunk_index"):
    nodes = []
    splitter = SentenceSplitter(chunk_size=512, chunk_overlap=70)
    for idx, doc in enumerate(tqdm(docs)):
        # print('Splitting: ' + str(idx))
        cur_nodes = splitter.get_nodes_from_documents([doc])
        for cur_node in cur_nodes:
            # ID will be base + parent
            file_path = doc.metadata["path"]
            new_node = IndexNode(
                text=cur_node.text or "None",
                index_id=str(file_path),
                metadata=doc.metadata,
                # obj=doc
            )
            nodes.append(new_node)
    print("num nodes: " + str(len(nodes)))

    # build and save index to disk, or load it if it already exists
    if not os.path.exists(out_path):
        index = VectorStoreIndex(nodes, embed_model=embed_model)
        index.set_index_id("simple_index")
        index.storage_context.persist(f"./{out_path}")
    else:
        # rebuild storage context
        storage_context = StorageContext.from_defaults(
            persist_dir=f"./{out_path}"
        )
        # load index
        index = load_index_from_storage(
            storage_context, index_id="simple_index", embed_model=embed_model
        )
    return index
index = build_index(docs)
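As an optional check that each chunk carries an index_id pointing back to its parent document (which is what lets the hybrid retriever look the document up in the docstore), we can retrieve a single chunk for an arbitrary query and print its reference:
sample = index.as_retriever(similarity_top_k=1).retrieve("What is LlamaIndex?")
print(sample[0].node.index_id)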
We define a hybrid retriever that first fetches chunks by vector similarity, and then reweights their scores based on similarity with the parent document (using an alpha parameter).
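For intuition, here is the reweighting arithmetic on hypothetical scores (not output from this notebook):
alpha = 0.5
chunk_score, doc_score = 0.80, 0.60  # hypothetical query-chunk and query-document similarities
final_score = alpha * chunk_score + (1 - alpha) * doc_score
print(final_score)  # 0.70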
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.indices.query.embedding_utils import get_top_k_embeddings
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from typing import List, Any, Optional
class HybridRetriever(BaseRetriever):
    """Hybrid retriever."""

    def __init__(
        self,
        vector_index,
        docstore,
        similarity_top_k: int = 2,
        out_top_k: Optional[int] = None,
        alpha: float = 0.5,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(**kwargs)
        self._vector_index = vector_index
        self._embed_model = vector_index._embed_model
        self._retriever = vector_index.as_retriever(
            similarity_top_k=similarity_top_k
        )
        self._out_top_k = out_top_k or similarity_top_k
        self._docstore = docstore
        self._alpha = alpha

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""
        # first retrieve chunks
        nodes = self._retriever.retrieve(query_bundle.query_str)

        # get source documents, and embedding similarity between query and documents
        ## get doc embeddings
        docs = [self._docstore.get_document(n.node.index_id) for n in nodes]
        doc_embeddings = [d.embedding for d in docs]
        query_embedding = self._embed_model.get_query_embedding(
            query_bundle.query_str
        )

        ## compute doc similarities
        doc_similarities, doc_idxs = get_top_k_embeddings(
            query_embedding, doc_embeddings
        )

        ## compute final similarity with doc similarities and original node similarity
        result_tups = []
        for doc_idx, doc_similarity in zip(doc_idxs, doc_similarities):
            node = nodes[doc_idx]
            # weight alpha * node similarity + (1-alpha) * doc similarity
            full_similarity = (self._alpha * node.score) + (
                (1 - self._alpha) * doc_similarity
            )
            print(
                f"Doc {doc_idx} (node score, doc similarity, full similarity): {(node.score, doc_similarity, full_similarity)}"
            )
            result_tups.append((full_similarity, node))

        result_tups = sorted(result_tups, key=lambda x: x[0], reverse=True)
        # update scores
        for full_score, node in result_tups:
            node.score = full_score

        # return the top out_top_k reweighted nodes
        return [n for _, n in result_tups][: self._out_top_k]
top_k = 10
out_top_k = 3
hybrid_retriever = HybridRetriever(
    index, docstore, similarity_top_k=top_k, out_top_k=out_top_k, alpha=0.5
)
base_retriever = index.as_retriever(similarity_top_k=out_top_k)
def show_nodes(nodes, out_len: int = 200):
    for idx, n in enumerate(nodes):
        print(f"\n\n >>>>>>>>>>>> ID {n.id_}: {n.metadata['path']}")
        print(n.get_content()[:out_len])
query_str = "Tell me more about the LLM interface and where they're used"
nodes = hybrid_retriever.retrieve(query_str)
show_nodes(nodes)
base_nodes = base_retriever.retrieve(query_str)
show_nodes(base_nodes)
from llama_index.core.query_engine import RetrieverQueryEngine
query_engine = RetrieverQueryEngine(hybrid_retriever)
base_query_engine = index.as_query_engine(similarity_top_k=out_top_k)
response = query_engine.query(query_str)
print(str(response))
base_response = base_query_engine.query(query_str)
print(str(base_response))
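To see which documents each engine actually drew from, you can inspect the source_nodes attached to each response (a standard attribute on LlamaIndex Response objects):
for sn in response.source_nodes:
    print(sn.node.metadata["path"], sn.score)
for sn in base_response.source_nodes:
    print(sn.node.metadata["path"], sn.score)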