docs/examples/vector_stores/MongoDBAtlasVectorSearchRAGOpenAI.ipynb
!pip install llama-index
!pip install llama-index-vector-stores-mongodb
!pip install llama-index-embeddings-openai
!pip install pymongo
!pip install datasets
!pip install pandas
%env OPENAI_API_KEY=OPENAI_API_KEY
from datasets import load_dataset
import pandas as pd

# Source dataset: https://huggingface.co/datasets/AIatMongoDB/embedded_movies
dataset = load_dataset("AIatMongoDB/embedded_movies")

# Materialize the training split as a pandas DataFrame
dataset_df = pd.DataFrame(dataset["train"])
dataset_df.head(5)

# Drop every row whose "fullplot" column is missing — the plot text is the
# content we embed, so rows without it are useless.
dataset_df = dataset_df.dropna(subset=["fullplot"])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Discard the pre-computed plot_embedding column; fresh embeddings will be
# generated with the newer OpenAI embedding model "text-embedding-3-small".
dataset_df = dataset_df.drop(columns=["plot_embedding"])
dataset_df.head(5)
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Default OpenAI chat model for generation; "text-embedding-3-small"
# truncated to 256 dimensions for embeddings (keeps vectors compact).
llm = OpenAI()
embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=256)

# Register both on the global Settings so LlamaIndex components pick them up.
Settings.embed_model = embed_model
Settings.llm = llm
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Round-trip the DataFrame through JSON to obtain plain Python dicts.
documents_list = json.loads(dataset_df.to_json(orient="records"))

# Metadata values must be one of (str, int, float, None), so the nested
# list/dict fields are serialized to JSON strings before use as metadata.
_NESTED_FIELDS = (
    "writers",
    "languages",
    "genres",
    "cast",
    "directors",
    "countries",
    "imdb",
    "awards",
)

llama_documents = []
for record in documents_list:
    for field in _NESTED_FIELDS:
        record[field] = json.dumps(record[field])

    # Build a Document whose text is the full plot; noisy or redundant
    # metadata keys are excluded from what the LLM / embedding model see.
    llama_documents.append(
        Document(
            text=record["fullplot"],
            metadata=record,
            excluded_llm_metadata_keys=["fullplot", "metacritic"],
            excluded_embed_metadata_keys=[
                "fullplot",
                "metacritic",
                "poster",
                "num_mflix_comments",
                "runtime",
                "rated",
            ],
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
        )
    )

# Observing an example of what the LLM and Embedding model receive as input
print(
    "\nThe LLM sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED),
)
llama_documents[0]
from llama_index.core.node_parser import SentenceSplitter

# Chunk each document into sentence-based nodes.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_documents)

# Attach an embedding to every node, computed over the node's full content
# (text plus all metadata, via metadata_mode="all").
for node in nodes:
    node.embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
Ensure your database, collection, and vector search index are set up on MongoDB Atlas, or the following steps won't work appropriately.
For assistance with database cluster setup and obtaining the URI, refer to this guide for setting up a MongoDB cluster, and this guide to get your connection string.
Once you have successfully created a cluster, create the database and collection within the MongoDB Atlas cluster by clicking “+ Create Database”. The database will be named movies, and the collection will be named movies_records.
Creating a vector search index within the movies_records collection is essential for efficient document retrieval from MongoDB into our development environment. To achieve this, refer to the official guide on vector search index creation.
import pymongo
from google.colab import userdata

# Retrieve the MongoDB connection string from Colab's secret store.
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Fail fast with an accurate message: the value comes from Colab
    # userdata (secrets), not OS environment variables, and continuing
    # with an empty URI only surfaces a confusing connection error later.
    raise ValueError("MONGO_URI not set in Colab secrets (userdata)")

# Synchronous client for the setup steps below; async client is handed to
# the vector store for its async query path.
mongo_client = pymongo.MongoClient(mongo_uri)
async_mongo_client = pymongo.AsyncMongoClient(mongo_uri)

DB_NAME = "movies"
COLLECTION_NAME = "movies_records"

db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# To ensure we are working with a fresh collection
# delete any existing records in the collection
collection.delete_many({})
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Wrap the Atlas collection as a LlamaIndex vector store, backed by the
# Atlas Search index named "vector_index" created earlier.
_store_kwargs = dict(
    mongodb_client=mongo_client,
    async_mongodb_client=async_mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name="vector_index",
)
vector_store = MongoDBAtlasVectorSearch(**_store_kwargs)

# Persist the embedded nodes into the MongoDB collection.
vector_store.add(nodes)
import pprint

from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.response.notebook_utils import display_response

# Build an index view over the already-populated vector store
# (no re-embedding happens here).
index = VectorStoreIndex.from_vector_store(vector_store)

# Retrieve the top 3 most similar nodes as context for the LLM.
query_engine = index.as_query_engine(similarity_top_k=3)

# Fixed typos in the user-facing prompt ("selecton" -> "selection",
# "christmas" -> "Christmas") so the LLM receives clean input.
query = (
    "Recommend a romantic movie suitable for the Christmas season "
    "and justify your selection"
)
response = query_engine.query(query)

display_response(response)
pprint.pprint(response.source_nodes)