Create an Index - Paradedb

Before a table can be searched, it must be indexed. ParadeDB uses a custom index type called the BM25 index. The following code block creates a BM25 index over several columns in the mock_items table.

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, description, category) WITH (key_field='id'); ```

python

from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "description": {},
                "category": {},
            },
            key_field="id",
            name="search_idx",
        ),
    )

python

from sqlalchemy import Index
from paradedb.sqlalchemy import indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(MockItem.description),
    indexing.BM25Field(MockItem.category),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)

ruby

ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    description: {},
    category: {}
  },
  key_field: :id,
  name: :search_idx
)

</CodeGroup> <Note> See the [getting started guide](/documentation/getting-started/environment) for more detail on how to set up your ORM to run index creation commands. </Note> <Note> You'll need to drop the existing `search_idx` before you can create a new one: <CodeGroup>

sql

DROP INDEX search_idx;

python

from django.db import connection

with connection.cursor() as cursor:
    cursor.execute("DROP INDEX search_idx")

python

from sqlalchemy import text

with engine.begin() as conn:
    conn.execute(text("DROP INDEX search_idx"))

ruby

ActiveRecord::Base.connection.remove_bm25_index(:mock_items, name: :search_idx)

</CodeGroup> </Note>

By default, text columns are tokenized using the unicode tokenizer, which splits text according to the Unicode segmentation standard. Because index creation is a time-consuming operation, we recommend experimenting with the available tokenizers to find the most suitable one before running CREATE INDEX.

For instance, if a column contains multiple languages, the ICU tokenizer may be more appropriate.

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, (description::pdb.icu), category) WITH (key_field='id'); ```

python

from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "description": {"tokenizer": "icu"},
                "category": {},
            },
            key_field="id",
            name="search_idx",
        ),
    )

python

from sqlalchemy import Index
from paradedb.sqlalchemy import indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(
        MockItem.description,
        tokenizer=indexing.tokenize.icu(),
    ),
    indexing.BM25Field(MockItem.category),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)

ruby

ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    description: { tokenizer: :icu },
    category: {}
  },
  key_field: :id,
  name: :search_idx
)

</CodeGroup>

Only one BM25 index can exist per table. We recommend indexing all columns in a table that may be present in a search query, including columns used for sorting, grouping, filtering, and aggregations.

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, description, category, rating, in_stock, created_at, metadata, weight_range) WITH (key_field='id'); ```

python

from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "description": {},
                "category": {},
                "rating": {},
                "in_stock": {},
                "created_at": {},
                "metadata": {},
                "weight_range": {},
            },
            key_field="id",
            name="search_idx",
        ),
    )

python

from sqlalchemy import Index
from paradedb.sqlalchemy import indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(MockItem.description),
    indexing.BM25Field(MockItem.category),
    indexing.BM25Field(MockItem.rating),
    indexing.BM25Field(MockItem.in_stock),
    indexing.BM25Field(MockItem.created_at),
    indexing.BM25Field(MockItem.metadata_),
    indexing.BM25Field(MockItem.weight_range),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)

ruby

ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    description: {},
    category: {},
    rating: {},
    in_stock: {},
    created_at: {},
    metadata: {},
    weight_range: {}
  },
  key_field: :id,
  name: :search_idx
)

</CodeGroup>

Most Postgres types, including text, JSON, numeric, timestamp, range, boolean, and arrays, can be indexed.

Track Create Index Progress

To monitor the progress of a long-running CREATE INDEX, open a separate Postgres connection and query pg_stat_progress_create_index:

sql

SELECT pid, phase, blocks_done, blocks_total
FROM pg_stat_progress_create_index;

Comparing blocks_done to blocks_total will provide a good approximation of the progress so far. If blocks_done equals blocks_total, that means that all rows have been indexed and the index is being flushed to disk.

Choosing a Key Field

In the CREATE INDEX statement above, note the mandatory key_field option. Every BM25 index needs a key_field, which is the name of a column that will function as a row’s unique identifier within the index.

The key_field must:

Have a UNIQUE constraint. Usually this means the table's PRIMARY KEY.
Be the first column in the column list.
Be untokenized, if it is a text field.

Token Filters

After tokens are created, token filters can be configured to apply further processing like lowercasing, stemming, or unaccenting. For example, the following code block adds English stemming to description:

<CodeGroup> ```sql SQL CREATE INDEX search_idx ON mock_items USING bm25 (id, (description::pdb.simple('stemmer=english')), category) WITH (key_field='id'); ```

python

from django.db import connection
from paradedb.indexes import BM25Index

with connection.schema_editor() as schema_editor:
    schema_editor.add_index(
        MockItem,
        BM25Index(
            fields={
                "id": {},
                "description": {
                    "tokenizer": "simple",
                    "filters": ["stemmer"],
                    "stemmer": "english",
                },
                "category": {},
            },
            key_field="id",
            name="search_idx",
        ),
    )

python

from sqlalchemy import Index
from paradedb.sqlalchemy import indexing

idx = Index(
    "search_idx",
    indexing.BM25Field(MockItem.id),
    indexing.BM25Field(
        MockItem.description,
        tokenizer=indexing.tokenize.simple(
            filters=["stemmer"],
            stemmer="english",
        ),
    ),
    indexing.BM25Field(MockItem.category),
    postgresql_using="bm25",
    postgresql_with={"key_field": "id"},
)

with engine.begin() as conn:
    idx.create(conn)

ruby

ActiveRecord::Base.connection.add_bm25_index(
  :mock_items,
  fields: {
    id: {},
    description: {
      tokenizer: :simple,
      filters: [:stemmer],
      stemmer: "english"
    },
    category: {}
  },
  key_field: :id,
  name: :search_idx
)

</CodeGroup>