docs/examples/vector_stores/LindormDemo.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/vector_stores/LindormDemo.ipynb" target="_parent">Open In Colab</a>
Lindorm is a cloud-native, multi-model database service. It lets you store data of all sizes, supports low-cost storage and processing of large amounts of data with pay-as-you-go billing, and is compatible with the open standards of multiple open-source projects, such as Apache HBase, Apache Cassandra, Apache Phoenix, OpenTSDB, Apache Solr, and SQL.
To run this notebook you need a Lindorm instance running in the cloud. You can create one by following this link.
After creating the instance, you can retrieve your instance information and run curl commands to connect to and use LindormSearch (a Python equivalent of that check is sketched below, after the instance configuration).
If you're opening this notebook on Colab, you will probably need to ensure you have llama-index installed:
!pip install llama-index
!pip install opensearch-py
%pip install llama-index-vector-stores-lindorm
# choose DashScope as the embedding and LLM model; you can also use the default OpenAI or another model to test
%pip install llama-index-embeddings-dashscope
%pip install llama-index-llms-dashscope
Import the needed package dependencies:
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.lindorm import (
LindormVectorStore,
LindormVectorClient,
)
from llama_index.core import VectorStoreIndex, StorageContext
Configure the DashScope embedding and LLM models; you can also use the default OpenAI or another model to test.
# set embedding model
from llama_index.core import Settings
from llama_index.embeddings.dashscope import DashScopeEmbedding
# Global Settings
Settings.embed_model = DashScopeEmbedding()
# configure the LLM model
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels
dashscope_llm = DashScope(model_name=DashScopeGenerationModels.QWEN_MAX)
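DashScope authenticates through the DASHSCOPE_API_KEY environment variable. If it is not already exported in your shell, set it before the first model call (the value below is a placeholder, not a real credential):
import os

# DashScope reads the API key from the environment at request time;
# replace the placeholder with your own key.
os.environ["DASHSCOPE_API_KEY"] = "your_dashscope_api_key"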
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
# load documents
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
print(f"Total documents: {len(documents)}")
print(f"First document, id: {documents[0].doc_id}")
print(f"First document, hash: {documents[0].hash}")
print(
"First document, text"
f" ({len(documents[0].text)} characters):\n{'='*20}\n{documents[0].text[:360]} ..."
)
# only needed in a Jupyter notebook
import nest_asyncio
nest_asyncio.apply()
# lindorm instance info
host = "ld-bp******jm*******-proxy-search-pub.lindorm.aliyuncs.com"
port = 30070
username = "your_username"
password = "your_password"
# index name used to demonstrate the VectorStore implementation
index_name = "lindorm_rag_test"
# extension param of Lindorm search: number of cluster units to query; between 1 and method.parameters.nlist (an IVFPQ param); no default value.
nprobe = "2"
# extension param of Lindorm search: usually used to improve recall accuracy, at the cost of extra overhead; between 1 and 200; default: 10.
reorder_factor = "10"
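Before building the index, you can optionally verify connectivity. Lindorm's search engine speaks an OpenSearch-compatible REST API, so the opensearch-py client installed above can ping it directly; this is a minimal sketch of the curl check mentioned earlier, assuming a plain-HTTP public endpoint:
from opensearchpy import OpenSearch

# Sketch of a connectivity check against the OpenSearch-compatible endpoint;
# assumes plain HTTP on the public proxy port configured above.
probe = OpenSearch(
    hosts=[{"host": host, "port": port}],
    http_auth=(username, password),
    use_ssl=False,
)
print(probe.info())  # prints cluster metadata if the credentials are valid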
# LindormVectorClient encapsulates logic for a single index with vector search enabled
client = LindormVectorClient(
host,
port,
username,
password,
index=index_name,
dimension=1536, # match dimension of your embedding model
nprobe=nprobe,
reorder_factor=reorder_factor,
# filter_type: "pre_filter" or "post_filter" (default)
)
# initialize vector store
vector_store = LindormVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# initialize an index using our sample data and the client we just created
index = VectorStoreIndex.from_documents(
documents=documents, storage_context=storage_context, show_progress=True
)
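Since the embeddings now live in Lindorm, a later session does not need to re-ingest the documents; it can rebuild the client with the same index_name and attach to the existing index (a sketch using the generic from_vector_store constructor):
# Reattach to the already-populated Lindorm index without re-ingesting
# (sketch): recreate the client with the same index_name, then wrap it.
existing_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)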
# set up the retriever
vector_retriever = index.as_retriever()
# search
source_nodes = vector_retriever.retrieve("What did the author do growing up?")
# check source_nodes
for node in source_nodes:
    # print(node.metadata)
    print("---------------------------------------------")
    print(f"Score: {node.score:.3f}")
    print(node.get_content())
    print("---------------------------------------------\n\n")
# run query
query_engine = index.as_query_engine(llm=dashscope_llm)
# query_engine = index.as_query_engine()
res = query_engine.query("What did the author do growing up?")
res.response
Lindorm Vector Store now supports metadata filtering at query time: exact-match key=value pairs, as well as range filters using >, <, >=, and <=.
from llama_index.core import Document
from llama_index.core.vector_stores import (
MetadataFilters,
MetadataFilter,
FilterOperator,
FilterCondition,
)
import re
# Split the text into paragraphs.
text_chunks = documents[0].text.split("\n\n")
# Create a document for each footnote
footnotes = [
Document(
text=chunk,
id_=documents[0].doc_id,
metadata={
"is_footnote": bool(re.search(r"^\s*\[\d+\]\s*", chunk)),
"mark_id": i,
},
)
for i, chunk in enumerate(text_chunks)
if re.search(r"^\s*\[\d+\]\s*", chunk)
]
# Insert the footnotes into the index
for f in footnotes:
index.insert(f)
retriever = index.as_retriever(
filters=MetadataFilters(
filters=[
MetadataFilter(
key="is_footnote", value="true", operator=FilterOperator.EQ
),
MetadataFilter(
key="mark_id", value=0, operator=FilterOperator.GTE
),
],
condition=FilterCondition.AND,
),
)
result = retriever.retrieve("What did the author say about space aliens and lisp?")
print(result)
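Range operators work the same way. For example, a retriever that keeps only chunks whose mark_id falls below a cutoff (the cutoff of 200 here is arbitrary, chosen just for illustration):
# Range-only filter sketch: keep chunks with mark_id < 200.
range_retriever = index.as_retriever(
    filters=MetadataFilters(
        filters=[
            MetadataFilter(
                key="mark_id", value=200, operator=FilterOperator.LT
            ),
        ],
    ),
)
print(range_retriever.retrieve("What did the author say about space aliens and lisp?"))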
# Create a query engine that only searches certain footnotes.
footnote_query_engine = index.as_query_engine(
filters=MetadataFilters(
filters=[
MetadataFilter(
key="is_footnote", value="true", operator=FilterOperator.EQ
),
MetadataFilter(
key="mark_id", value=0, operator=FilterOperator.GTE
),
],
condition=FilterCondition.AND,
),
llm=dashscope_llm,
)
res = footnote_query_engine.query(
    "What did the author say about space aliens and lisp?"
)
res.response
Lindorm search supports hybrid search. Note that the minimum search granularity of the query string is one token.
from llama_index.core.vector_stores.types import VectorStoreQueryMode
retriever = index.as_retriever(
vector_store_query_mode=VectorStoreQueryMode.HYBRID
)
result = retriever.retrieve("What did the author say about space aliens and lisp?")
print(result)
query_engine = index.as_query_engine(
llm=dashscope_llm, vector_store_query_mode=VectorStoreQueryMode.HYBRID
)
res = query_engine.query("What did the author say about space aliens and lisp?")
res.response
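When you are done experimenting, you can remove the demo data. Calling delete on the vector store drops all nodes associated with a given ref_doc_id (a sketch; it assumes you want to clear everything ingested from the essay document):
# Optional cleanup (sketch): delete all nodes ingested from the original
# essay document, identified by its ref_doc_id.
vector_store.delete(ref_doc_id=documents[0].doc_id)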