docs/documentation/query-builder/specialized/more-like-this.mdx
The more like this (MLT) query finds documents that are "like" another document.
To use this query, pass the key field value of the input document to `pdb.more_like_this`.
For instance, the following query finds documents that are "like" a document with an id of 3:
from paradedb import MoreLikeThis, ParadeDB
MockItem.objects.filter(
id=ParadeDB(MoreLikeThis(id=3))
).values('id', 'description', 'rating', 'category').order_by('id')
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search
stmt = (
select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
.where(search.more_like_this(MockItem.id, document_id=3))
.order_by(MockItem.id)
)
with Session(engine) as session:
session.execute(stmt).all()
MockItem.more_like_this(3)
.select(:id, :description, :rating, :category)
.order(:id)
```
 id |     description      | rating | category
----+----------------------+--------+----------
  3 | Sleek running shoes  |      5 | Footwear
  4 | White jogging shoes  |      3 | Footwear
  5 | Generic shoes        |      4 | Footwear
 13 | Sturdy hiking boots  |      4 | Footwear
 23 | Comfortable slippers |      3 | Footwear
 33 | Winter woolen socks  |      5 | Footwear
(6 rows)
```
In the output above, notice that documents matching on any of the indexed fields (`description`, `rating`, and `category`) were returned.
This is because, by default, all fields present in the index are considered for matching.
To find only documents that match on specific fields, provide an array of field names as the second argument:
<CodeGroup>

```sql SQL
SELECT id, description, rating, category
FROM mock_items
WHERE id @@@ pdb.more_like_this(3, ARRAY['description'])
ORDER BY id;
```

```python Django
from paradedb import MoreLikeThis, ParadeDB

MockItem.objects.filter(
    id=ParadeDB(MoreLikeThis(id=3, fields=['description']))
).values('id', 'description', 'rating', 'category').order_by('id')
```

```python SQLAlchemy
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search

stmt = (
    select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
    .where(search.more_like_this(MockItem.id, document_id=3, fields=["description"]))
    .order_by(MockItem.id)
)

with Session(engine) as session:
    session.execute(stmt).all()
```

```ruby Rails
MockItem.more_like_this(3, fields: [:description])
        .select(:id, :description, :rating, :category)
        .order(:id)
```

</CodeGroup>
```
 id |     description     | rating | category
----+---------------------+--------+----------
  3 | Sleek running shoes |      5 | Footwear
  4 | White jogging shoes |      3 | Footwear
  5 | Generic shoes       |      4 | Footwear
(3 rows)
```
Let's look at how the MLT query works under the hood:
For the input document above, the terms considered are `sleek`, `running`, and `shoes` for the `description` field; `5` for the `rating` field; and `footwear` for the `category` field.

In addition to providing a key field value, a custom document can also be provided as JSON. The JSON keys are field names and must correspond to field names in the index.
<CodeGroup>

```sql SQL
SELECT id, description, rating, category
FROM mock_items
WHERE id @@@ pdb.more_like_this('{"description": "Sleek running shoes", "category": "footwear"}')
ORDER BY id;
```

```python Django
from paradedb import MoreLikeThis, ParadeDB

MockItem.objects.filter(
    id=ParadeDB(MoreLikeThis(document={'description': 'Sleek running shoes', 'category': 'footwear'}))
).values('id', 'description', 'rating', 'category').order_by('id')
```

```python SQLAlchemy
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search

stmt = (
    select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
    .where(
        search.more_like_this(
            MockItem.id,
            document={"description": "Sleek running shoes", "category": "footwear"},
        )
    )
    .order_by(MockItem.id)
)

with Session(engine) as session:
    session.execute(stmt).all()
```

```ruby Rails
MockItem.more_like_this({ description: "Sleek running shoes", category: "footwear" }.to_json)
        .select(:id, :description, :rating, :category)
        .order(:id)
```

</CodeGroup>
`min_term_frequency` excludes terms that appear fewer than a certain number of times in the input document,
while `max_term_frequency` excludes terms that appear more than that many times. By default, no terms are excluded
based on term frequency.
For instance, the following query returns no results because no term appears twice in the input document.
<CodeGroup>

```sql SQL
SELECT id, description, rating, category
FROM mock_items
WHERE id @@@ pdb.more_like_this(3, min_term_frequency => 2)
ORDER BY id;
```

```python Django
from paradedb import MoreLikeThis, ParadeDB

MockItem.objects.filter(
    id=ParadeDB(MoreLikeThis(id=3, min_term_freq=2))
).values('id', 'description', 'rating', 'category').order_by('id')
```

```python SQLAlchemy
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search

stmt = (
    select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
    .where(search.more_like_this(MockItem.id, document_id=3, min_term_frequency=2))
    .order_by(MockItem.id)
)

with Session(engine) as session:
    session.execute(stmt).all()
```

```ruby Rails
MockItem.more_like_this(3, min_term_freq: 2)
        .select(:id, :description, :rating, :category)
        .order(:id)
```

</CodeGroup>
min_doc_frequency excludes terms that appear in fewer than a certain number of documents across the entire index,
while max_doc_frequency excludes terms that appear in more than that many documents. By default, no terms are excluded
based on document frequency.
from paradedb import MoreLikeThis, ParadeDB
MockItem.objects.filter(
id=ParadeDB(MoreLikeThis(id=3, min_doc_freq=3))
).values('id', 'description', 'rating', 'category').order_by('id')
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search
stmt = (
select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
.where(search.more_like_this(MockItem.id, document_id=3, min_doc_frequency=3))
.order_by(MockItem.id)
)
with Session(engine) as session:
session.execute(stmt).all()
MockItem.more_like_this(3, min_doc_freq: 3)
.select(:id, :description, :rating, :category)
.order(:id)
By default, only the top 25 terms across all fields are considered for matching. Terms are scored using a combination of inverse document frequency and term frequency (TF-IDF) — this means that terms that appear frequently in the input document and are rare across the index score the highest.
This can be configured with `max_query_terms`:
from paradedb import MoreLikeThis, ParadeDB
MockItem.objects.filter(
id=ParadeDB(MoreLikeThis(id=3, max_query_terms=10))
).values('id', 'description', 'rating', 'category').order_by('id')
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search
stmt = (
select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
.where(search.more_like_this(MockItem.id, document_id=3, max_query_terms=10))
.order_by(MockItem.id)
)
with Session(engine) as session:
session.execute(stmt).all()
MockItem.more_like_this(3, max_query_terms: 10)
.select(:id, :description, :rating, :category)
.order(:id)
min_word_length and max_word_length can be used to exclude terms that are too short or too long, respectively. By default, no terms
are excluded based on length.
from paradedb import MoreLikeThis, ParadeDB
MockItem.objects.filter(
id=ParadeDB(MoreLikeThis(id=3, min_word_length=5))
).values('id', 'description', 'rating', 'category').order_by('id')
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search
stmt = (
select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
.where(search.more_like_this(MockItem.id, document_id=3, min_word_length=5))
.order_by(MockItem.id)
)
with Session(engine) as session:
session.execute(stmt).all()
MockItem.more_like_this(3, min_word_length: 5)
.select(:id, :description, :rating, :category)
.order(:id)
To exclude terms from being considered, provide a text array to stopwords:
from paradedb import MoreLikeThis, ParadeDB
MockItem.objects.filter(
id=ParadeDB(MoreLikeThis(id=3, stopwords=['the', 'a']))
).values('id', 'description', 'rating', 'category').order_by('id')
from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import search
stmt = (
select(MockItem.id, MockItem.description, MockItem.rating, MockItem.category)
.where(search.more_like_this(MockItem.id, document_id=3, stopwords=["the", "a"]))
.order_by(MockItem.id)
)
with Session(engine) as session:
session.execute(stmt).all()
MockItem.more_like_this(3, stopwords: %w[the a])
.select(:id, :description, :rating, :category)
.order(:id)