docs/examples/node_postprocessor/PrevNextPostprocessorDemo.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/node_postprocessor/PrevNextPostprocessorDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
Showcase the capabilities of leveraging Node relationships (previous/next) on top of Paul Graham's essay.
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
!pip install llama-index
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import (
PrevNextNodePostprocessor,
AutoPrevNextNodePostprocessor,
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
# load documents
from llama_index.core import StorageContext
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
# define settings
from llama_index.core import Settings
Settings.chunk_size = 512
# use node parser in settings to parse into nodes
nodes = Settings.node_parser.get_nodes_from_documents(documents)
# add to docstore
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)
# build index
index = VectorStoreIndex(nodes, storage_context=storage_context)
node_postprocessor = PrevNextNodePostprocessor(docstore=docstore, num_nodes=4)
query_engine = index.as_query_engine(
similarity_top_k=1,
node_postprocessors=[node_postprocessor],
response_mode="tree_summarize",
)
response = query_engine.query(
"What did the author do after handing off Y Combinator to Sam Altman?",
)
print(response)
# Try querying index without node postprocessor
query_engine = index.as_query_engine(
similarity_top_k=1, response_mode="tree_summarize"
)
response = query_engine.query(
"What did the author do after handing off Y Combinator to Sam Altman?",
)
print(response)
# Try querying index without node postprocessor and higher top-k
query_engine = index.as_query_engine(
similarity_top_k=3, response_mode="tree_summarize"
)
response = query_engine.query(
"What did the author do after handing off Y Combinator to Sam Altman?",
)
print(response)
node_postprocessor = AutoPrevNextNodePostprocessor(
docstore=docstore,
num_nodes=3,
verbose=True,
)
# Infer that we need to search nodes after current one
query_engine = index.as_query_engine(
similarity_top_k=1,
node_postprocessors=[node_postprocessor],
response_mode="tree_summarize",
)
response = query_engine.query(
"What did the author do after handing off Y Combinator to Sam Altman?",
)
print(response)
# Infer that we don't need to search previous or next
response = query_engine.query(
"What did the author do during his time at Y Combinator?",
)
print(response)
# Infer that we need to search nodes before current one
response = query_engine.query(
"What did the author do before handing off Y Combinator to Sam Altman?",
)
print(response)
response = query_engine.query(
"What did the author do before handing off Y Combinator to Sam Altman?",
)
print(response)