Back to Llama Index

Tablestore Demo

docs/examples/docstore/TablestoreDocstoreDemo.ipynb

0.14.216.7 KB
Original Source

Tablestore Demo

This guide shows you how to directly use our DocumentStore abstraction backed by Tablestore. By putting nodes in the docstore, this allows you to define multiple indices over the same underlying docstore, instead of duplicating data across indices.

python
%pip install llama-index-storage-docstore-tablestore
%pip install llama-index-storage-index-store-tablestore
%pip install llama-index-vector-stores-tablestore

%pip install llama-index-llms-dashscope
%pip install llama-index-embeddings-dashscope

%pip install llama-index
%pip install matplotlib
python
import nest_asyncio

nest_asyncio.apply()
python
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
python
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex
from llama_index.core import SummaryIndex
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import Settings

Config Tablestore

Next, we use tablestore's docsstore to perform a demo.

python
import getpass
import os

os.environ["tablestore_end_point"] = getpass.getpass("tablestore end_point:")
os.environ["tablestore_instance_name"] = getpass.getpass(
    "tablestore instance_name:"
)
os.environ["tablestore_access_key_id"] = getpass.getpass(
    "tablestore access_key_id:"
)
os.environ["tablestore_access_key_secret"] = getpass.getpass(
    "tablestore access_key_secret:"
)

Config DashScope LLM

Next, we use dashscope's llm to perform a demo.

python
import os
import getpass

os.environ["DASHSCOPE_API_KEY"] = getpass.getpass("DashScope api key:")

Download Data

python
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

Load Documents

python
reader = SimpleDirectoryReader("./data/paul_graham/")
documents = reader.load_data()

Parse into Nodes

python
from llama_index.core.node_parser import SentenceSplitter

nodes = SentenceSplitter().get_nodes_from_documents(documents)

Init Store/Embedding/LLM/StorageContext

python
from llama_index.storage.docstore.tablestore import TablestoreDocumentStore
from llama_index.storage.index_store.tablestore import TablestoreIndexStore
from llama_index.vector_stores.tablestore import TablestoreVectorStore
from llama_index.embeddings.dashscope import (
    DashScopeEmbedding,
    DashScopeTextEmbeddingModels,
    DashScopeTextEmbeddingType,
)
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels

embedder = DashScopeEmbedding(
    model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V3,  # default demiension is 1024
    text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
)

dashscope_llm = DashScope(
    model_name=DashScopeGenerationModels.QWEN_MAX,
    api_key=os.environ["DASHSCOPE_API_KEY"],
)
Settings.llm = dashscope_llm

docstore = TablestoreDocumentStore.from_config(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
)

index_store = TablestoreIndexStore.from_config(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
)

vector_store = TablestoreVectorStore(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
    vector_dimension=1024,  # embedder dimension is 1024
)
vector_store.create_table_if_not_exist()
vector_store.create_search_index_if_not_exist()

storage_context = StorageContext.from_defaults(
    docstore=docstore, index_store=index_store, vector_store=vector_store
)

Add to docStore

python
storage_context.docstore.add_documents(nodes)

Define & Add Multiple Indexes

Each index uses the same underlying Node.

python
# https://gpt-index.readthedocs.io/en/latest/api_reference/indices/list.html
summary_index = SummaryIndex(nodes, storage_context=storage_context)
python
# https://gpt-index.readthedocs.io/en/latest/api_reference/indices/vector_store.html
vector_index = VectorStoreIndex(
    nodes,
    insert_batch_size=20,
    embed_model=embedder,
    storage_context=storage_context,
)
python
# https://gpt-index.readthedocs.io/en/latest/api_reference/indices/table.html
keyword_table_index = SimpleKeywordTableIndex(
    nodes=nodes,
    storage_context=storage_context,
    llm=dashscope_llm,
)
python
# NOTE: the docstore still has the same nodes
len(storage_context.docstore.docs)

Test out saving and loading

python
# NOTE: docstore and index_store is persisted in Tablestore by default
# NOTE: here only need to persist simple vector store to disk
storage_context.persist()
python
# note down index IDs
list_id = summary_index.index_id
vector_id = vector_index.index_id
keyword_id = keyword_table_index.index_id
print(list_id, vector_id, keyword_id)
python
from llama_index.core import load_index_from_storage

# re-create storage context
storage_context = StorageContext.from_defaults(
    docstore=docstore, index_store=index_store, vector_store=vector_store
)

summary_index = load_index_from_storage(
    storage_context=storage_context,
    index_id=list_id,
)
keyword_table_index = load_index_from_storage(
    llm=dashscope_llm,
    storage_context=storage_context,
    index_id=keyword_id,
)
# You need to add "vector_store=xxx" to StorageContext to load vector index from Tablestore
vector_index = load_index_from_storage(
    insert_batch_size=20,
    embed_model=embedder,
    storage_context=storage_context,
    index_id=vector_id,
)

Test out some Queries

python
Settings.llm = dashscope_llm
Settings.chunk_size = 1024
python
query_engine = summary_index.as_query_engine()
list_response = query_engine.query("What is a summary of this document?")
python
display_response(list_response)
python
query_engine = vector_index.as_query_engine()
vector_response = query_engine.query("What did the author do growing up?")
python
display_response(vector_response)
python
query_engine = keyword_table_index.as_query_engine()
keyword_response = query_engine.query(
    "What did the author do after his time at YC?"
)
python
display_response(keyword_response)