docs/examples/retrievers/bm25_retriever.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/retrievers/bm25_retriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
In this guide, we define a BM25 retriever that searches documents using the BM25 method. BM25 (Best Matching 25) is a ranking function that extends TF-IDF by considering term frequency saturation and document length. BM25 effectively ranks documents based on query term occurrence and rarity across the corpus.
This notebook is very similar to the RouterQueryEngine notebook.
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
%pip install llama-index
%pip install llama-index-retrievers-bm25
import os
# Provide the OpenAI API key via environment variable so the OpenAI LLM and
# embedding clients below can authenticate. Replace the placeholder with a real key.
os.environ["OPENAI_API_KEY"] = "sk-proj-..."
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
# Register global defaults: every index/query-engine built later in this
# notebook will pick up this LLM and embedding model unless overridden.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
# Download the sample Paul Graham essay used throughout this notebook.
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
We first show how to convert a Document into a set of Nodes, and insert into a DocumentStore.
from llama_index.core import SimpleDirectoryReader
# load documents
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
from llama_index.core.node_parser import SentenceSplitter
# initialize node parser: chunk the essay into ~512-token sentence-aligned
# nodes, which become the retrieval units for BM25 below
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)
One option is to create the BM25Retriever directly from nodes, and save to and from disk.
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
# We can pass in the index, docstore, or list of nodes to create the retriever
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=2,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)
# The retriever state can be written to disk and reloaded later,
# avoiding re-tokenizing/re-indexing the corpus.
bm25_retriever.persist("./bm25_retriever")
loaded_bm25_retriever = BM25Retriever.from_persist_dir("./bm25_retriever")
Here, we cover using a BM25Retriever with a docstore to hold your nodes. The advantage here is that the docstore can be remote (mongodb, redis, etc.)
# initialize a docstore to store nodes
# also available are mongodb, redis, postgres, etc for docstores
from llama_index.core.storage.docstore import SimpleDocumentStore
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
# We can pass in the index, docstore, or list of nodes to create the retriever;
# here the docstore variant is used so nodes could live in a remote store
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    similarity_top_k=2,
    # Optional: We can pass in the stemmer and set the language for stopwords
    # This is important for removing stopwords and stemming the query + text
    # The default is english for both
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)
from llama_index.core.response.notebook_utils import display_source_node
# Run two sample queries through the BM25 retriever and render each
# retrieved node inline in the notebook (first query targets the
# Viaweb/Interleaf passages, second the post-RISD period).
for query in (
    "What happened at Viaweb and Interleaf?",
    "What did the author do after RISD?",
):
    retrieved_nodes = bm25_retriever.retrieve(query)
    for retrieved in retrieved_nodes:
        display_source_node(retrieved, source_length=5000)
# Initialize documents with some metadata (the "key" field is used
# below to demonstrate metadata filtering with BM25)
from llama_index.core import Document
documents = [
    Document(text="Hello, world!", metadata={"key": "1"}),
    Document(text="Hello, world! 2", metadata={"key": "2"}),
    Document(text="Hello, world! 3", metadata={"key": "3"}),
    Document(text="Hello, world! 2.1", metadata={"key": "2"}),
]
# Initialize node parser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)
# Add nodes to docstore (NOTE: this rebinds `nodes`/`docstore` from the
# earlier cells to the small metadata-tagged corpus)
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
# Define metadata filters: keep only nodes whose metadata "key" equals "2"
# (two of the four documents above match)
from llama_index.core.vector_stores.types import (
    MetadataFilters,
    MetadataFilter,
    FilterOperator,
    FilterCondition,
)
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="key",
            value="2",
            operator=FilterOperator.EQ,
        )
    ],
    # AND is a no-op with a single filter, shown here for completeness
    condition=FilterCondition.AND,
)
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
# Build a BM25 retriever over the docstore with the metadata filters
# attached, then query it; only nodes whose metadata passes the
# filters are eligible for ranking.
filtered_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    similarity_top_k=3,
    filters=filters,  # Add filters here
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)
retrieved_nodes = filtered_retriever.retrieve("Hello, world!")
for result in retrieved_nodes:
    display_source_node(result, source_length=5000)
Now we will combine bm25 and chroma for sparse and dense retrieval.
The results are combined using the QueryFusionRetriever.
With the retriever, we can make a complete RetrieverQueryEngine.
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
# Fresh docstore holding the current `nodes` (the metadata corpus from above)
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)
# Persistent chroma client: embeddings are written under ./chroma_db on disk
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("dense_vectors")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# StorageContext wires the docstore (sparse side) and chroma (dense side)
# into one index
storage_context = StorageContext.from_defaults(
    docstore=docstore, vector_store=vector_store
)
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)
import nest_asyncio
# Allow nested event loops so use_async=True works inside Jupyter,
# which already runs its own asyncio loop
nest_asyncio.apply()
from llama_index.core.retrievers import QueryFusionRetriever
# Fuse results from the dense (vector) retriever and the sparse (BM25)
# retriever built over the same docstore
retriever = QueryFusionRetriever(
    [
        index.as_retriever(similarity_top_k=2),
        BM25Retriever.from_defaults(
            docstore=index.docstore, similarity_top_k=2
        ),
    ],
    # num_queries=1 disables LLM-based query generation; only the
    # original query string is used
    num_queries=1,
    use_async=True,
)
nodes = retriever.retrieve("What happened at Viaweb and Interleaf?")
for node in nodes:
    display_source_node(node, source_length=5000)
from llama_index.core.query_engine import RetrieverQueryEngine
# Wrap the fusion retriever in a query engine so retrieved context is
# synthesized into a natural-language answer by the configured LLM
query_engine = RetrieverQueryEngine(retriever)
response = query_engine.query("What did the author do after RISD?")
print(response)
With our data in chroma, and our nodes in our docstore, we can save and recreate!
The vector store is already saved automatically by chroma, but we will need to save our docstore.
# Chroma persists its vectors automatically; only the docstore needs an
# explicit save so the nodes can be reloaded later
storage_context.docstore.persist("./docstore.json")
# or, we could ignore the docstore and just persist the bm25 retriever as shown below
# bm25_retriever.persist("./bm25_retriever")
Now, we can reload and re-create our index.
# Re-open the persisted chroma collection and the saved docstore, then
# rebuild the index without re-embedding (nodes=[] because the vectors
# already live in chroma)
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("dense_vectors")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
docstore = SimpleDocumentStore.from_persist_path("./docstore.json")
storage_context = StorageContext.from_defaults(
    docstore=docstore, vector_store=vector_store
)
index = VectorStoreIndex(nodes=[], storage_context=storage_context)