Comparing Methods for Structured Retrieval (Auto-Retrieval vs. Recursive Retrieval)

docs/examples/retrievers/auto_vs_recursive_retriever.ipynb


Open this notebook in Colab: https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/retrievers/auto_vs_recursive_retriever.ipynb

In a naive RAG system, the set of input documents is chunked, embedded, and dumped into a vector database collection. Retrieval then just fetches the top-k chunks by embedding similarity.

This can fail when the set of documents is large - raw chunks can be hard to disambiguate, and similarity search alone isn't guaranteed to surface the documents that actually contain the relevant context.

In this guide we explore structured retrieval - more advanced query algorithms that take advantage of structure within your documents for higher-precision retrieval. We compare the following two methods:

  • Metadata Filters + Auto-Retrieval: Tag each document with the right set of metadata. During query-time, use auto-retrieval to infer metadata filters along with passing through the query string for semantic search.
  • Store Document Hierarchies (summaries -> raw chunks) + Recursive Retrieval: Embed document summaries and map that to the set of raw chunks for each document. During query-time, do recursive retrieval to first fetch summaries before fetching documents.

If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.

python
%pip install llama-index-llms-openai
%pip install llama-index-vector-stores-weaviate
python
!pip install llama-index
python
import nest_asyncio

nest_asyncio.apply()
python
import logging
import sys
from llama_index.core import SimpleDirectoryReader
from llama_index.core import SummaryIndex

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
python
wiki_titles = ["Michael Jordan", "Elon Musk", "Richard Branson", "Rihanna"]
wiki_metadatas = {
    "Michael Jordan": {
        "category": "Sports",
        "country": "United States",
    },
    "Elon Musk": {
        "category": "Business",
        "country": "United States",
    },
    "Richard Branson": {
        "category": "Business",
        "country": "UK",
    },
    "Rihanna": {
        "category": "Music",
        "country": "Barbados",
    },
}
python
from pathlib import Path

import requests

for title in wiki_titles:
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
    ).json()
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page["extract"]

    data_path = Path("data")
    if not data_path.exists():
        data_path.mkdir()

    with open(data_path / f"{title}.txt", "w") as fp:
        fp.write(wiki_text)
python
# Load all wiki documents
docs_dict = {}
for wiki_title in wiki_titles:
    doc = SimpleDirectoryReader(
        input_files=[f"data/{wiki_title}.txt"]
    ).load_data()[0]

    doc.metadata.update(wiki_metadatas[wiki_title])
    docs_dict[wiki_title] = doc
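
As a quick sanity check, you can confirm the custom metadata was attached to each document:

python
# expect something like {'file_path': ..., 'category': 'Sports', 'country': 'United States'}
print(docs_dict["Michael Jordan"].metadata)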
python
from llama_index.llms.openai import OpenAI
from llama_index.core.callbacks import LlamaDebugHandler, CallbackManager
from llama_index.core.node_parser import SentenceSplitter


llm = OpenAI(model="gpt-4")
callback_manager = CallbackManager([LlamaDebugHandler()])
splitter = SentenceSplitter(chunk_size=256)

Metadata Filters + Auto-Retrieval

In this approach, we tag each document with metadata (category, country) and store it in a Weaviate vector db.

At query time, we then perform "auto-retrieval": an LLM infers the relevant set of metadata filters from the query and passes the rest of the query string through for semantic search.
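
For intuition, here is a minimal sketch of the manually specified filtered query that auto-retrieval infers for you. It uses the `index` we build below; `MetadataFilters` and `ExactMatchFilter` come from `llama_index.core.vector_stores`:

python
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

# the manual equivalent of what auto-retrieval infers from
# "Tell me about a celebrity from the United States"
filters = MetadataFilters(
    filters=[ExactMatchFilter(key="country", value="United States")]
)
# manual_retriever = index.as_retriever(filters=filters, similarity_top_k=2)
# nodes = manual_retriever.retrieve("celebrity")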

python
## Setup Weaviate
import weaviate

# cloud
auth_config = weaviate.AuthApiKey(api_key="<api_key>")
client = weaviate.Client(
    "https://llama-index-test-v0oggsoz.weaviate.network",
    auth_client_secret=auth_config,
)
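
If you don't have a Weaviate cloud instance, a locally running Weaviate works the same way. A sketch, assuming a default local deployment on port 8080 (this notebook uses the v3 `weaviate-client` API):

python
# local alternative (assumes a Weaviate instance listening on localhost:8080)
# client = weaviate.Client("http://localhost:8080")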
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from IPython.display import Markdown, display
python
# delete any existing "LlamaIndex" collection so we start fresh
client.schema.delete_class("LlamaIndex")
python
from llama_index.core import StorageContext

# If you want to load the index later, be sure to give it a name!
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name="LlamaIndex"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# NOTE: you may also choose to define an index_name manually.
# index_name = "test_prefix"
# vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name)
python
# validate that the schema was created
class_schema = client.schema.get("LlamaIndex")
display(class_schema)
python
index = VectorStoreIndex(
    [],
    storage_context=storage_context,
    transformations=[splitter],
    callback_manager=callback_manager,
)

# add documents to index
for wiki_title in wiki_titles:
    index.insert(docs_dict[wiki_title])
python
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo


vector_store_info = VectorStoreInfo(
    content_info="brief biography of celebrities",
    metadata_info=[
        MetadataInfo(
            name="category",
            type="str",
            description=(
                "Category of the celebrity, one of [Sports, Entertainment,"
                " Business, Music]"
            ),
        ),
        MetadataInfo(
            name="country",
            type="str",
            description=(
                "Country of the celebrity, one of [United States, Barbados,"
                " Portugal]"
            ),
        ),
    ],
)
retriever = VectorIndexAutoRetriever(
    index,
    vector_store_info=vector_store_info,
    llm=llm,
    callback_manager=callback_manager,
    max_top_k=10000,
)
python
# NOTE: asking for "top k to 10000" in the query below is a hack to return all
# matching data. Right now auto-retrieval always returns a fixed top-k;
# there's a TODO to allow top-k to be None so that all matching nodes are
# fetched without this workaround.
nodes = retriever.retrieve(
    "Tell me about a celebrity from the United States, set top k to 10000"
)
python
print(f"Number of nodes: {len(nodes)}")
for node in nodes[:10]:
    print(node.node.get_content())
python
nodes = retriever.retrieve(
    "Tell me about the childhood of a popular sports celebrity in the United"
    " States"
)
for node in nodes:
    print(node.node.get_content())
python
nodes = retriever.retrieve(
    "Tell me about the college life of a billionaire who started at company at"
    " the age of 16"
)
for node in nodes:
    print(node.node.get_content())
python
nodes = retriever.retrieve("Tell me about the childhood of a UK billionaire")
for node in nodes:
    print(node.node.get_content())
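
If you want a synthesized answer rather than raw nodes, you can wrap the auto-retriever in a query engine. A minimal sketch (`RetrieverQueryEngine` lives in `llama_index.core.query_engine`; `auto_query_engine` is just an illustrative name):

python
from llama_index.core.query_engine import RetrieverQueryEngine

auto_query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)
response = auto_query_engine.query(
    "Tell me about the childhood of a UK billionaire"
)
print(str(response))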

Build Recursive Retriever over Document Summaries
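
In this approach, we generate an LLM summary of each document and embed it as an `IndexNode`. Each `IndexNode`'s `index_id` points at a per-document retriever; at query time, the recursive retriever first fetches the best-matching summary node, then follows that link to retrieve raw chunks from the corresponding document index.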

python
from llama_index.core.schema import IndexNode
python
# define top-level nodes and vector retrievers
nodes = []
vector_query_engines = {}
vector_retrievers = {}

for wiki_title in wiki_titles:
    # build vector index
    vector_index = VectorStoreIndex.from_documents(
        [docs_dict[wiki_title]],
        transformations=[splitter],
        callback_manager=callback_manager,
    )
    # define query engines
    vector_query_engine = vector_index.as_query_engine(llm=llm)
    vector_query_engines[wiki_title] = vector_query_engine
    vector_retrievers[wiki_title] = vector_index.as_retriever()

    # save summaries
    out_path = Path("summaries") / f"{wiki_title}.txt"
    if not out_path.exists():
        # use LLM-generated summary
        summary_index = SummaryIndex.from_documents(
            [docs_dict[wiki_title]], callback_manager=callback_manager
        )

        summarizer = summary_index.as_query_engine(
            response_mode="tree_summarize", llm=llm
        )
        response = await summarizer.aquery(
            f"Give me a summary of {wiki_title}"
        )

        wiki_summary = response.response
        Path("summaries").mkdir(exist_ok=True)
        with open(out_path, "w") as fp:
            fp.write(wiki_summary)
    else:
        with open(out_path, "r") as fp:
            wiki_summary = fp.read()

    print(f"**Summary for {wiki_title}: {wiki_summary}")
    node = IndexNode(text=wiki_summary, index_id=wiki_title)
    nodes.append(node)
python
# define top-level retriever
top_vector_index = VectorStoreIndex(
    nodes, transformations=[splitter], callback_manager=callback_manager
)
top_vector_retriever = top_vector_index.as_retriever(similarity_top_k=1)
python
# define recursive retriever
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer
python
# note: you could instead pass `vector_query_engines` as `query_engine_dict`,
# so that recursion runs each document's query engine instead of its raw retriever
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": top_vector_retriever, **vector_retrievers},
    # query_engine_dict=vector_query_engines,
    verbose=True,
)
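
As with the auto-retriever, the recursive retriever can be wrapped in a query engine to synthesize answers over the retrieved chunks; this is what the `RetrieverQueryEngine` and `get_response_synthesizer` imports above are for. A minimal sketch:

python
# sketch: synthesize a response over the recursively retrieved nodes
response_synthesizer = get_response_synthesizer(llm=llm)
recursive_query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever, response_synthesizer=response_synthesizer
)
# response = recursive_query_engine.query(
#     "Tell me about a celebrity from the United States"
# )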
python
# run recursive retriever
nodes = recursive_retriever.retrieve(
    "Tell me about a celebrity from the United States"
)
for node in nodes:
    print(node.node.get_content())
python
nodes = recursive_retriever.retrieve(
    "Tell me about the childhood of a billionaire who started at company at"
    " the age of 16"
)
for node in nodes:
    print(node.node.get_content())