examples/basic_functionality/alternative_embeddings.ipynb
This notebook demonstrates how to use alternative embedding functions.
# Imports grouped together before any runtime setup (PEP 8).
import chromadb
from chromadb.utils import embedding_functions

# A single Chroma client shared by every example section below.
client = chromadb.Client()
# --- OpenAI embeddings ----------------------------------------------------
# Assumes the `openai` package is installed.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="OPENAI_KEY",  # Replace with your own OpenAI API key
    model_name="text-embedding-ada-002",
)

# Fetch (or create on first run) a collection backed by the OpenAI embedder.
openai_collection = client.get_or_create_collection(
    name="openai_embeddings",
    embedding_function=openai_ef,
)

# Index two toy documents, each with metadata and a unique id.
openai_collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"],
)

# Retrieve the two nearest documents for a sample query.
results = openai_collection.query(
    query_texts=["This is a query document"],
    n_results=2,
)
results
# --- Cohere embeddings ----------------------------------------------------
# Assumes the `cohere` package is installed.
cohere_ef = embedding_functions.CohereEmbeddingFunction(
    api_key="COHERE_API_KEY",  # Replace with your own Cohere API key
    model_name="large"
)
# Use get_or_create_collection (as the OpenAI example does) so re-running
# this notebook does not fail with a "collection already exists" error.
cohere_collection = client.get_or_create_collection(name="cohere_embeddings", embedding_function=cohere_ef)
# Index two toy documents, each with metadata and a unique id.
cohere_collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)
# Retrieve the two nearest documents for a sample query.
results = cohere_collection.query(
    query_texts=["This is a query document"],
    n_results=2
)
results
# --- Instructor embeddings ------------------------------------------------
# Requires the InstructorEmbedding package:
#   pip install InstructorEmbedding
# Default constructor uses the base model on CPU.
instructor_ef = embedding_functions.InstructorEmbeddingFunction()
# For task-specific embeddings, add an instruction:
# instructor_ef = embedding_functions.InstructorEmbeddingFunction(
# instruction="Represent the Wikipedia document for retrieval: "
# )
# Uses hkunlp/instructor-xl model and GPU:
#instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-xl", device="cuda")
# Use get_or_create_collection (as the OpenAI example does) so re-running
# this notebook does not fail with a "collection already exists" error.
instructor_collection = client.get_or_create_collection(name="instructor_embeddings", embedding_function=instructor_ef)
# Index two toy documents, each with metadata and a unique id.
instructor_collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)
# Adding documents with an instruction:
# instructor_ef = embedding_functions.InstructorEmbeddingFunction(
# instruction="Represent the Science sentence: "
# )
# instructor_collection = client.create_collection(name="instructor_embeddings", embedding_function=instructor_ef)
# instructor_collection.add(documents=["Parton energy loss in QCD matter"], ids=["id1"])
# Retrieve the two nearest documents for a sample query.
results = instructor_collection.query(
    query_texts=["This is a query document"],
    n_results=2
)
results
# Querying with an instruction:
# instructor_ef = embedding_functions.InstructorEmbeddingFunction(instruction="Represent the Wikipedia question for retrieving supporting documents: ")
# instructor_collection = client.get_collection(name="instructor_embeddings", embedding_function=instructor_ef)
# results = instructor_collection.query(query_texts=["where is the food stored in a yam plant"])
# --- HuggingFace embeddings -----------------------------------------------
# The embedding function requires a HuggingFace API key.
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key="HUGGINGFACE_API_KEY",  # Replace with your own HuggingFace API key
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# Use get_or_create_collection (as the OpenAI example does) so re-running
# this notebook does not fail with a "collection already exists" error.
huggingface_collection = client.get_or_create_collection(name="huggingface_embeddings", embedding_function=huggingface_ef)
# Index two toy documents, each with metadata and a unique id.
huggingface_collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"]
)
# Retrieve the two nearest documents for a sample query.
results = huggingface_collection.query(
    query_texts=["This is a query document"],
    n_results=2
)
results