docs/examples/vector_stores/MongoDBAtlasVectorSearchRAGOpenAI.ipynb
!pip install llama-index
!pip install llama-index-vector-stores-mongodb
!pip install llama-index-embeddings-openai
!pip install pymongo
!pip install datasets
!pip install pandas
%env OPENAI_API_KEY=OPENAI_API_KEY
from datasets import load_dataset
import pandas as pd

# Source dataset: https://huggingface.co/datasets/AIatMongoDB/embedded_movies
dataset = load_dataset("AIatMongoDB/embedded_movies")

# Materialize the training split as a pandas DataFrame
dataset_df = pd.DataFrame(dataset["train"])
dataset_df.head(5)

# Drop every row whose "fullplot" column is missing — the plot text is the
# content we embed, so rows without it are useless.
dataset_df = dataset_df.dropna(subset=["fullplot"])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Discard the pre-computed plot_embedding column; fresh embeddings will be
# generated with the newer OpenAI embedding model "text-embedding-3-small".
dataset_df = dataset_df.drop(columns=["plot_embedding"])
dataset_df.head(5)
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Default OpenAI chat model for generation; "text-embedding-3-small"
# truncated to 256 dimensions for embeddings (keeps vectors compact).
llm = OpenAI()
embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=256)

# Register both on the global Settings so LlamaIndex components pick them up.
Settings.embed_model = embed_model
Settings.llm = llm
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Round-trip the DataFrame through JSON to obtain plain Python dicts.
documents_list = json.loads(dataset_df.to_json(orient="records"))

# Metadata values must be one of (str, int, float, None), so the nested
# list/dict fields are serialized to JSON strings before use as metadata.
_NESTED_FIELDS = (
    "writers",
    "languages",
    "genres",
    "cast",
    "directors",
    "countries",
    "imdb",
    "awards",
)

llama_documents = []
for record in documents_list:
    for field in _NESTED_FIELDS:
        record[field] = json.dumps(record[field])

    # Build a Document whose text is the full plot; noisy or redundant
    # metadata keys are excluded from what the LLM / embedding model see.
    llama_documents.append(
        Document(
            text=record["fullplot"],
            metadata=record,
            excluded_llm_metadata_keys=["fullplot", "metacritic"],
            excluded_embed_metadata_keys=[
                "fullplot",
                "metacritic",
                "poster",
                "num_mflix_comments",
                "runtime",
                "rated",
            ],
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
        )
    )

# Observing an example of what the LLM and Embedding model receive as input
print(
    "\nThe LLM sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED),
)
llama_documents[0]
from llama_index.core.node_parser import SentenceSplitter

# Chunk each document into sentence-based nodes.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_documents)

# Attach an embedding to every node, computed over the node's full content
# (text plus all metadata, via metadata_mode="all").
for node in nodes:
    node.embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
Ensure your database, collection, and vector search index are set up on MongoDB Atlas, or the following steps won't work appropriately.
For assistance with database cluster setup and obtaining the URI, refer to this guide for setting up a MongoDB cluster, and this guide to get your connection string.
Once you have successfully created a cluster, create the database and collection within the MongoDB Atlas cluster by clicking “+ Create Database”. The database will be named movies, and the collection will be named movies_records.
Creating a vector search index within the movies_records collection is essential for efficient document retrieval from MongoDB into our development environment. To achieve this, refer to the official guide on vector search index creation.
import pymongo
from google.colab import userdata

# Retrieve the MongoDB connection string from Colab's secret store.
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Fail fast with an accurate message: the value comes from Colab
    # userdata (secrets), not OS environment variables, and continuing
    # with an empty URI only surfaces a confusing connection error later.
    raise ValueError("MONGO_URI not set in Colab secrets (userdata)")

# Synchronous client for the setup steps below; async client is handed to
# the vector store for its async query path.
mongo_client = pymongo.MongoClient(mongo_uri)
async_mongo_client = pymongo.AsyncMongoClient(mongo_uri)

DB_NAME = "movies"
COLLECTION_NAME = "movies_records"

db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# To ensure we are working with a fresh collection
# delete any existing records in the collection
collection.delete_many({})
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Wrap the Atlas collection as a LlamaIndex vector store, backed by the
# Atlas Search index named "vector_index" created earlier.
_store_kwargs = dict(
    mongodb_client=mongo_client,
    async_mongodb_client=async_mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name="vector_index",
)
vector_store = MongoDBAtlasVectorSearch(**_store_kwargs)

# Persist the embedded nodes into the MongoDB collection.
vector_store.add(nodes)
import pprint

from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.response.notebook_utils import display_response

# Build an index view over the already-populated vector store
# (no re-embedding happens here).
index = VectorStoreIndex.from_vector_store(vector_store)

# Retrieve the top 3 most similar nodes as context for the LLM.
query_engine = index.as_query_engine(similarity_top_k=3)

# Fixed typos in the user-facing prompt ("selecton" -> "selection",
# "christmas" -> "Christmas") so the LLM receives clean input.
query = (
    "Recommend a romantic movie suitable for the Christmas season "
    "and justify your selection"
)
response = query_engine.query(query)

display_response(response)
pprint.pprint(response.source_nodes)