Back to Paradedb

Indexing JSON

docs/documentation/indexing/indexing-json.mdx

0.23.34.3 KB
Original Source

When indexing JSON, ParadeDB automatically indexes all sub-fields of the JSON object. The type of each sub-field is also inferred automatically. For example, consider the following statement where metadata is JSONB:

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, metadata) WITH (key_field='id'); ```
python
from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "metadata": {},
            },
            key_field="id",
            name="search_idx",
        ),
    )
python
from sqlalchemy import Index
from paradedb.sqlalchemy import indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(MockItem.metadata_),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)
ruby
ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    metadata: {}
  },
  key_field: :id,
  name: :search_idx
)
</CodeGroup>

A single metadata JSON may look like:

json
{ "color": "Silver", "location": "United States" }

ParadeDB will automatically index both metadata.color and metadata.location as text.

By default, all text sub-fields of a JSON object use the same tokenizer. The tokenizer can be configured the same way as text fields:

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, (metadata::pdb.ngram(2,3))) WITH (key_field='id'); ```
python
from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "metadata": {
                    "tokenizer": "ngram",
                    "args": [2, 3],
                },
            },
            key_field="id",
            name="search_idx",
        ),
    )
python
from sqlalchemy import Index
from paradedb.sqlalchemy import indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(
        MockItem.metadata_,
        tokenizer=indexing.tokenize.ngram(min_gram=2, max_gram=3),
    ),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)
ruby
ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    metadata: { tokenizer: :ngram, named_args: { min: 2, max: 3 } }
  },
  key_field: :id,
  name: :search_idx
)
</CodeGroup>

Instead of indexing the entire JSON, sub-fields of the JSON can be indexed individually. This allows for configuring separate tokenizers within a larger JSON:

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, ((metadata->>'color')::pdb.ngram(2,3))) WITH (key_field='id'); ```
python
from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "metadata": {
                    "json_keys": {
                        "color": {
                            "tokenizer": "ngram",
                            "args": [2, 3],
                        },
                    },
                },
            },
            key_field="id",
            name="search_idx",
        ),
    )
python
from sqlalchemy import Index
from paradedb.sqlalchemy import expr, indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(
        expr.json_text(MockItem.metadata_, "color"),
        tokenizer=indexing.tokenize.ngram(min_gram=2, max_gram=3),
    ),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)
ruby
ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    "metadata->>'color'" => { tokenizer: :ngram, named_args: { min: 2, max: 3 } }
  },
  key_field: :id,
  name: :search_idx
)
</CodeGroup>