docs/examples/line_based_chunking.ipynb
The LineBasedTokenChunker is a tokenization-aware chunker that preserves line boundaries. It's particularly useful for structured content like tables, code, or logs where line boundaries are semantically important.
Key features: it never splits a line across chunks, it enforces a per-chunk token limit, it can repeat a prefix (such as a table header) at the start of every chunk, and it can optionally omit that prefix for lines that would otherwise overflow the token limit.
%pip install -qU pip docling transformers
from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer
In this example, we'll chunk a table while repeating the header in each chunk.
# Setup tokenizer with a reasonable token limit.
# max_tokens=50 is deliberately small so the table below is forced to split
# into several chunks, making the repeated header prefix visible.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=50,  # Small limit to demonstrate chunking
)

# Create chunker with a table-header prefix that is repeated in every chunk.
chunker = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix="| Name | Age | Department |\n|------|-----|------------|\n",
    omit_prefix_on_overflow=False,  # Always include prefix (default)
)

# Sample table rows; each line keeps its trailing "\n" so line boundaries
# are explicit to the chunker.
lines = [
    "| Alice | 30 | Engineering |\n",
    "| Bob | 25 | Marketing |\n",
    "| Charlie | 35 | Sales |\n",
    "| Diana | 28 | HR |\n",
    "| Eve | 32 | Finance |\n",
]

print(f"Max tokens: {chunker.max_tokens}")
print(f"Prefix token count: {chunker.prefix_len}\n")

chunks = chunker.chunk_text(lines)
print(f"Total chunks: {len(chunks)}\n")

# Show each chunk with its token count.
for i, chunk in enumerate(chunks, 1):
    print(f"=== Chunk {i} ===")
    print(chunk)
    print(f"Tokens: {tokenizer.count_tokens(chunk)}\n")
## omit_prefix_on_overflow

When working with wide tables, some rows might fit within the token limit without the header but not with it. The `omit_prefix_on_overflow` parameter provides flexibility in these cases.
# Setup tokenizer with a very small token limit so that some rows overflow
# once the header prefix is added.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=30,  # Very small limit to force overflow
)

# Create a longer (wider) table-header prefix.
prefix = (
    "| Name | Age | Department | Location |\n|------|-----|------------|----------|\n"
)
print(f"Prefix token count: {tokenizer.count_tokens(prefix)}")
print(f"Max tokens: {tokenizer.get_max_tokens()}\n")

# Sample lines - some will be too long together with the prefix.
lines = [
    "| Alice Johnson | 30 | Engineering | San Francisco |\n",
    "| Bob Smith | 25 | Marketing | New York |\n",
    "| Charlie Brown with a very long name | 35 | Sales Department | Los Angeles |\n",
]

# Report each line's token count, alone and combined with the prefix.
# The prefix's own count is loop-invariant, so compute it once up front.
prefix_tokens = tokenizer.count_tokens(prefix)
print("Token counts:")
for i, line in enumerate(lines, 1):
    line_tokens = tokenizer.count_tokens(line)
    with_prefix = line_tokens + prefix_tokens
    print(f" Line {i}: {line_tokens} tokens (with prefix: {with_prefix} tokens)")
print()
# omit_prefix_on_overflow=False (default behavior): the prefix is always
# included, so a line that cannot fit together with the prefix may be split.
chunker_no_omit = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix=prefix,
    omit_prefix_on_overflow=False,  # Default: always include prefix
)

chunks_no_omit = chunker_no_omit.chunk_text(lines)
print("=" * 60)
print("WITHOUT omit_prefix_on_overflow (may split long lines)")
print("=" * 60)
print(f"\nTotal chunks: {len(chunks_no_omit)}\n")

# Every chunk should start with the prefix in this mode.
for i, chunk in enumerate(chunks_no_omit, 1):
    print(f"--- Chunk {i} ---")
    print(chunk)
    print(f"Tokens: {tokenizer.count_tokens(chunk)}")
    print(f"Has prefix: {chunk.startswith(prefix)}\n")
# omit_prefix_on_overflow=True: a line that would overflow together with the
# prefix is emitted without the prefix instead of being split.
chunker_with_omit = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix=prefix,
    omit_prefix_on_overflow=True,  # Omit prefix for lines that would overflow
)

chunks_with_omit = chunker_with_omit.chunk_text(lines)
print("=" * 60)
print("WITH omit_prefix_on_overflow (keeps lines intact)")
print("=" * 60)
print(f"\nTotal chunks: {len(chunks_with_omit)}\n")

# Chunks built from overflowing lines will report Has prefix: False.
for i, chunk in enumerate(chunks_with_omit, 1):
    print(f"--- Chunk {i} ---")
    print(chunk)
    print(f"Tokens: {tokenizer.count_tokens(chunk)}")
    print(f"Has prefix: {chunk.startswith(prefix)}\n")
The LineBasedTokenChunker can also be used directly with DoclingDocument objects.
from docling.document_converter import DocumentConverter

# Convert a Markdown file into a DoclingDocument.
converter = DocumentConverter()
result = converter.convert("../../tests/data/md/wiki.md")
doc = result.document

# Create chunker with a larger token budget suited to general document text.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=100,
)
chunker = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix="",  # No prefix for general documents
)

# Chunk the document; chunker.chunk() yields chunk objects lazily.
chunks = list(chunker.chunk(doc))
print(f"Total chunks: {len(chunks)}\n")

# Display the first few chunks, truncating long text for readability.
for i, chunk in enumerate(chunks[:3], 1):
    print(f"=== Chunk {i} ===")
    preview = chunk.text[:200] + "..." if len(chunk.text) > 200 else chunk.text
    print(f"Text: {preview}")
    print(f"Tokens: {tokenizer.count_tokens(chunk.text)}")
    print(f"Doc items: {len(chunk.meta.doc_items)}\n")
When a prefix exceeds the max_tokens limit, it's automatically split into multiple chunks and only included at the beginning.
import warnings

# Tokenizer with a small limit so the long prefix below cannot fit in a
# single chunk.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=25,  # Small limit
)

# A prefix deliberately much longer than max_tokens.
large_prefix = (
    "This is a very long table header that contains a lot of information " * 10
)
print(f"Large prefix token count: {tokenizer.count_tokens(large_prefix)} tokens")
print(f"Max tokens allowed: {tokenizer.get_max_tokens()} tokens\n")

# Creating a chunker with an oversized prefix triggers a warning; capture it
# so it can be shown instead of being printed to stderr.
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    chunker_large = LineBasedTokenChunker(
        tokenizer=tokenizer,
        prefix=large_prefix,
    )
    if w:
        print("⚠️ Warning issued:")
        print(f" {w[0].message}\n")

print(f"Number of prefix chunks: {len(chunker_large.prefix_chunks)}")
print(f"Prefix len (for single chunk): {chunker_large.prefix_len}\n")

# Show how the oversized prefix was split into multiple chunks.
print("Prefix chunks:")
for i, prefix_chunk in enumerate(chunker_large.prefix_chunks, 1):
    preview = prefix_chunk[:100] + "..." if len(prefix_chunk) > 100 else prefix_chunk
    print(f" Chunk {i}: {tokenizer.count_tokens(prefix_chunk)} tokens")
    print(f" Content: {preview}\n")

# Test chunking with the large prefix.
lines = [
    "Row 1: Some data here\n",
    "Row 2: More data here\n",
    "Row 3: Even more data\n",
]
chunks_large = chunker_large.chunk_text(lines)
print(f"Total chunks (including prefix chunks): {len(chunks_large)}")
print(f"Content chunks: {len(chunks_large) - len(chunker_large.prefix_chunks)}\n")

# Display all chunks, labelling prefix chunks vs. content chunks.
# The prefix-chunk count is loop-invariant, so look it up once.
n_prefix_chunks = len(chunker_large.prefix_chunks)
for i, chunk in enumerate(chunks_large, 1):
    chunk_type = "[PREFIX CHUNK]" if i <= n_prefix_chunks else "[CONTENT CHUNK]"
    print(f"Chunk {i} {chunk_type}:")
    preview = chunk[:100] + "..." if len(chunk) > 100 else chunk
    print(f" Content: {preview}")
    print(f" Tokens: {tokenizer.count_tokens(chunk)}\n")
In summary, the `LineBasedTokenChunker` chunks text while preserving line boundaries and repeating an optional prefix in each chunk. Use `omit_prefix_on_overflow=True` to keep long lines intact by dropping the prefix when needed, or `omit_prefix_on_overflow=False` (default) to always include the prefix, at the cost of possibly splitting long lines.