docs/examples/cookbooks/mongodb_retrieval_strategies.ipynb
In this notebook, we will explore and tune different retrieval options in MongoDB's LlamaIndex integration to get the most relevant results.
!pip install -qU pymongo llama-index llama-index-llms-openai llama-index-vector-stores-mongodb
Set the MongoDB connection string: Follow the steps here to get the connection string from the Atlas UI.
Set the OpenAI API key: Follow the steps here to obtain an API key.
# Environment setup: prompt for secrets interactively so they are never
# hard-coded into the notebook.
import os
import getpass
from pymongo import MongoClient, AsyncMongoClient

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
MONGODB_URI = getpass.getpass("Enter your MongoDB URI: ")

# Synchronous client: used for collection management and search-index
# creation below. The `appname` tags connections for server-side attribution.
mongodb_client = MongoClient(
    MONGODB_URI, appname="devrel.content.retrieval_strategies_llamaindex"
)
# Async client: handed to the vector store for its asynchronous query paths.
async_mongodb_client = AsyncMongoClient(
    MONGODB_URI, appname="devrel.content.retrieval_strategies_llamaindex"
)
from datasets import load_dataset
import pandas as pd
from llama_index.core import Document

# Load the pre-embedded sample movie dataset from the Hugging Face Hub
# (network I/O) and convert it to a DataFrame for easier manipulation.
data = load_dataset("MongoDB/embedded_movies", split="train")
data = pd.DataFrame(data)
data.head()

# Fill Nones in the dataframe
# NOTE(review): the fill values are *strings* ("[]", "{}"), not empty
# list/dict objects — downstream code that calls .get() on `imdb` or joins
# the list fields assumes real dict/list values; verify the dataset has no
# missing rows for these columns.
data = data.fillna(
    {"genres": "[]", "languages": "[]", "cast": "[]", "imdb": "{}"}
)
# Convert each DataFrame row into a LlamaIndex Document with searchable
# text and filterable metadata.
documents = []
for _, row in data.iterrows():
    # Extract required fields, guarding against the *string* placeholders
    # ("{}", "[]") substituted by the fillna step above: calling .get() on
    # a string would raise AttributeError, and joining the string "[]"
    # would produce "[, ]".
    title = row["title"]
    imdb = row["imdb"] if isinstance(row["imdb"], dict) else {}
    rating = imdb.get("rating", 0)
    languages = [] if isinstance(row["languages"], str) else list(row["languages"])
    cast = [] if isinstance(row["cast"], str) else list(row["cast"])
    genres = [] if isinstance(row["genres"], str) else list(row["genres"])
    # Metadata is stored alongside the text; its keys are what the vector
    # index filter paths (metadata.rating, metadata.languages) target.
    metadata = {"title": title, "rating": rating, "languages": languages}
    # Human-readable text that gets embedded and full-text indexed.
    text = (
        f"Title: {title}\n"
        f"Plot: {row['fullplot']}\n"
        f"Cast: {', '.join(cast)}\n"
        f"Genres: {', '.join(genres)}\n"
        f"Languages: {', '.join(languages)}\n"
        f"Rating: {rating}"
    )
    documents.append(Document(text=text, metadata=metadata))

# Sanity check: inspect the first document.
print(documents[0].text)
print(documents[0].metadata)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from pymongo.operations import SearchIndexModel
from pymongo.errors import OperationFailure

# Use OpenAI's text-embedding-3-small globally; its output dimension must
# agree with the `numDimensions` (1536) in the vector index definition below.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Names of the Atlas Search indexes and the target database/collection.
VS_INDEX_NAME = "vector_index"
FTS_INDEX_NAME = "fts_index"
DB_NAME = "llamaindex"
COLLECTION_NAME = "hybrid_search"
collection = mongodb_client[DB_NAME][COLLECTION_NAME]
# Wire both Mongo clients into the LlamaIndex vector store; the field names
# here must match the paths used in the search-index definitions below.
vector_store = MongoDBAtlasVectorSearch(
    mongodb_client=mongodb_client,
    async_mongodb_client=async_mongodb_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    vector_index_name=VS_INDEX_NAME,
    fulltext_index_name=FTS_INDEX_NAME,
    embedding_key="embedding",  # document field that stores the vector
    text_key="text",  # document field that stores the raw text
)
# If the collection has documents with embeddings already, create the vector store index from the vector store
if collection.count_documents({}) > 0:
    vector_store_index = VectorStoreIndex.from_vector_store(vector_store)
# If the collection does not have documents, embed and ingest them into the vector store
# (this calls the OpenAI embedding API once per document — network I/O).
else:
    vector_store_context = StorageContext.from_defaults(
        vector_store=vector_store
    )
    vector_store_index = VectorStoreIndex.from_documents(
        documents, storage_context=vector_store_context, show_progress=True
    )
# Atlas Vector Search index: one vector field plus "filter" fields so
# queries can pre-filter results by rating and language.
vs_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                # Must match the embedding model's output dimension
                # (text-embedding-3-small -> 1536) and the vector store's
                # `embedding_key`.
                "numDimensions": 1536,
                "similarity": "cosine",
            },
            {"type": "filter", "path": "metadata.rating"},
            # Fixed: documents store this field as "languages" (see the
            # metadata dict built at ingestion, and the MetadataFilter key
            # "metadata.languages" used later). The previous path
            # "metadata.language" (singular) would make that filter
            # match nothing.
            {"type": "filter", "path": "metadata.languages"},
        ]
    },
    name=VS_INDEX_NAME,
    type="vectorSearch",
)

# Full-text (BM25) search index over the raw document text only;
# dynamic mapping disabled so no other fields are indexed.
fts_model = SearchIndexModel(
    definition={
        "mappings": {"dynamic": False, "fields": {"text": {"type": "string"}}}
    },
    name=FTS_INDEX_NAME,
    type="search",
)

# Create both indexes. Atlas raises OperationFailure when an index with
# the same name already exists; treat that as "already created" and skip.
for model in [vs_model, fts_model]:
    try:
        collection.create_search_index(model=model)
    except OperationFailure:
        print(
            f"Duplicate index found for model {model}. Skipping index creation."
        )
def get_recommendations(query: str, mode: str, **kwargs) -> None:
    """
    Print the top movie recommendations for a user query.

    Args:
        query (str): User query
        mode (str): Retrieval mode. One of (default, text_search, hybrid)
        **kwargs: Extra options forwarded to the query engine,
            e.g. ``alpha`` or ``filters``.
    """
    engine = vector_store_index.as_query_engine(
        similarity_top_k=5, vector_store_query_mode=mode, **kwargs
    )
    result = engine.query(query)
    # One line per retrieved source node: title, rating, relevance score.
    for hit in result.source_nodes:
        meta = hit.metadata
        print(
            f"Title: {meta['title']} | Rating: {meta['rating']} | Relevance Score: {hit.score}"
        )
# Full-text (keyword) search only — best for exact term matches.
get_recommendations(
    query="Action movies about humans fighting machines",
    mode="text_search",
)

# Pure vector (semantic) search — the default retrieval mode.
get_recommendations(
    query="Action movies about humans fighting machines", mode="default"
)

# Vector and full-text search weighted equal by default
get_recommendations(
    query="Action movies about humans fighting machines", mode="hybrid"
)

# Higher alpha, vector search dominates
get_recommendations(
    query="Action movies about humans fighting machines",
    mode="hybrid",
    alpha=0.7,
)

# Lower alpha, full-text search dominates
get_recommendations(
    query="Action movies about humans fighting machines",
    mode="hybrid",
    alpha=0.3,
)
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)

# Pre-filter candidates: rating strictly greater than 7 AND available in
# English, combined with AND.
# NOTE(review): these keys must match the "filter" paths declared in the
# vector search index definition — verify "metadata.languages" is declared
# there, or this language filter will not apply.
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="metadata.rating", value=7, operator=FilterOperator.GT
        ),
        MetadataFilter(
            key="metadata.languages",
            value="English",
            operator=FilterOperator.EQ,
        ),
    ],
    condition=FilterCondition.AND,
)

# Hybrid retrieval (vector-weighted, alpha=0.7) restricted by the filters.
get_recommendations(
    query="Action movies about humans fighting machines",
    mode="hybrid",
    alpha=0.7,
    filters=filters,
)