Back to Docling

Line-Based Token Chunking

docs/examples/line_based_chunking.ipynb

Version 2.92.0 · 8.4 KB
Original Source

Line-Based Token Chunking

Overview

The LineBasedTokenChunker is a tokenization-aware chunker that preserves line boundaries. It's particularly useful for structured content like tables, code, or logs where line boundaries are semantically important.

Key features:

  • Line preservation: Keeps entire lines within a single chunk when possible
  • Prefix support: Add repeated context (e.g., table headers) to each chunk
  • Overflow handling: Choose between splitting lines or omitting prefix when lines are too long

Setup

python
%pip install -qU pip docling transformers
python
from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer

Example 1: Basic Table Chunking with Prefix

In this example, we'll chunk a table while repeating the header in each chunk.

python
# Configure a tokenizer with a deliberately small budget so the example
# produces several chunks.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=50,  # Small limit to demonstrate chunking
)

# The prefix (header row plus separator row) is repeated at the top of
# every emitted chunk.
chunker = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix="| Name | Age | Department |\n|------|-----|------------|\n",
    omit_prefix_on_overflow=False,  # Always include prefix (default)
)

# Table body rows to be distributed across chunks.
lines = [
    "| Alice | 30 | Engineering |\n",
    "| Bob | 25 | Marketing |\n",
    "| Charlie | 35 | Sales |\n",
    "| Diana | 28 | HR |\n",
    "| Eve | 32 | Finance |\n",
]

print(f"Max tokens: {chunker.max_tokens}")
print(f"Prefix token count: {chunker.prefix_len}\n")

chunks = chunker.chunk_text(lines)

print(f"Total chunks: {len(chunks)}\n")
# Show each chunk together with its measured token count.
for chunk_no, chunk in enumerate(chunks, start=1):
    print(f"=== Chunk {chunk_no} ===")
    print(chunk)
    print(f"Tokens: {tokenizer.count_tokens(chunk)}\n")

Example 2: Handling Wide Tables with omit_prefix_on_overflow

When working with wide tables, some rows might fit without the header but not with it. The omit_prefix_on_overflow parameter provides flexibility in these cases.

python
# Setup tokenizer with a very small token limit so that some rows
# overflow once the prefix is added.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=30,  # Very small limit to force overflow
)

# A longer, four-column header to repeat in each chunk.
prefix = (
    "| Name | Age | Department | Location |\n|------|-----|------------|----------|\n"
)

# Tokenize the prefix once up front; the count is loop-invariant, so it
# is reused below instead of being recomputed for every line.
prefix_token_count = tokenizer.count_tokens(prefix)

print(f"Prefix token count: {prefix_token_count}")
print(f"Max tokens: {tokenizer.get_max_tokens()}\n")

# Sample lines - some will be too long with prefix
lines = [
    "| Alice Johnson | 30 | Engineering | San Francisco |\n",
    "| Bob Smith | 25 | Marketing | New York |\n",
    "| Charlie Brown with a very long name | 35 | Sales Department | Los Angeles |\n",
]

# Check token counts for each line, with and without the prefix.
print("Token counts:")
for i, line in enumerate(lines, 1):
    line_tokens = tokenizer.count_tokens(line)
    with_prefix = line_tokens + prefix_token_count
    print(f"  Line {i}: {line_tokens} tokens (with prefix: {with_prefix} tokens)")
print()

Without omit_prefix_on_overflow (default behavior)

python
# Default behavior: the prefix is prepended to every chunk, even when
# doing so forces a long line to be split across chunks.
chunker_no_omit = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix=prefix,
    omit_prefix_on_overflow=False,  # Default: always include prefix
)

chunks_no_omit = chunker_no_omit.chunk_text(lines)

banner = "=" * 60
print(banner)
print("WITHOUT omit_prefix_on_overflow (may split long lines)")
print(banner)
print(f"\nTotal chunks: {len(chunks_no_omit)}\n")

# Report each chunk's size and whether the header made it in.
for chunk_no, chunk in enumerate(chunks_no_omit, start=1):
    print(f"--- Chunk {chunk_no} ---")
    print(chunk)
    print(f"Tokens: {tokenizer.count_tokens(chunk)}")
    print(f"Has prefix: {chunk.startswith(prefix)}\n")

With omit_prefix_on_overflow=True

python
# Alternative behavior: drop the prefix from chunks where including it
# would push the line over the token budget, keeping the line intact.
chunker_with_omit = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix=prefix,
    omit_prefix_on_overflow=True,  # Omit prefix for lines that would overflow
)

chunks_with_omit = chunker_with_omit.chunk_text(lines)

banner = "=" * 60
print(banner)
print("WITH omit_prefix_on_overflow (keeps lines intact)")
print(banner)
print(f"\nTotal chunks: {len(chunks_with_omit)}\n")

# Report each chunk's size and whether the header made it in.
for chunk_no, chunk in enumerate(chunks_with_omit, start=1):
    print(f"--- Chunk {chunk_no} ---")
    print(chunk)
    print(f"Tokens: {tokenizer.count_tokens(chunk)}")
    print(f"Has prefix: {chunk.startswith(prefix)}\n")

Example 3: Chunking a DoclingDocument

The LineBasedTokenChunker can also be used directly with DoclingDocument objects.

python
from docling.document_converter import DocumentConverter

# Convert a markdown test file into a DoclingDocument.
converter = DocumentConverter()
doc = converter.convert("../../tests/data/md/wiki.md").document

# Tokenizer with a moderate budget for whole-document chunking.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=100,
)

# An empty prefix: nothing is repeated between chunks.
chunker = LineBasedTokenChunker(
    tokenizer=tokenizer,
    prefix="",  # No prefix for general documents
)

# Chunk the document; chunk() yields, so materialize the results.
chunks = list(chunker.chunk(doc))

print(f"Total chunks: {len(chunks)}\n")

# Preview the first three chunks, truncating long text for display.
for chunk_no, chunk in enumerate(chunks[:3], start=1):
    print(f"=== Chunk {chunk_no} ===")
    if len(chunk.text) > 200:
        print(f"Text: {chunk.text[:200]}...")
    else:
        print(f"Text: {chunk.text}")
    print(f"Tokens: {tokenizer.count_tokens(chunk.text)}")
    print(f"Doc items: {len(chunk.meta.doc_items)}\n")

Example 4: Handling Large Prefixes

When a prefix exceeds the max_tokens limit, it is automatically split into multiple prefix chunks, which are emitted once at the beginning of the output rather than being repeated in every content chunk.

python
import warnings

# Give the tokenizer a tiny budget so the prefix alone exceeds it.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2"),
    max_tokens=25,  # Small limit
)

large_prefix = (
    "This is a very long table header that contains a lot of information " * 10
)

print(f"Large prefix token count: {tokenizer.count_tokens(large_prefix)} tokens")
print(f"Max tokens allowed: {tokenizer.get_max_tokens()} tokens\n")

# Constructing the chunker with an oversized prefix triggers a warning;
# capture it so the message can be displayed.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")

    chunker_large = LineBasedTokenChunker(
        tokenizer=tokenizer,
        prefix=large_prefix,
    )

    if caught:
        print("⚠️  Warning issued:")
        print(f"   {caught[0].message}\n")

print(f"Number of prefix chunks: {len(chunker_large.prefix_chunks)}")
print(f"Prefix len (for single chunk): {chunker_large.prefix_len}\n")

# Show how the oversized prefix was split.
print("Prefix chunks:")
for chunk_no, piece in enumerate(chunker_large.prefix_chunks, start=1):
    if len(piece) > 100:
        preview = piece[:100] + "..."
    else:
        preview = piece
    print(f"  Chunk {chunk_no}: {tokenizer.count_tokens(piece)} tokens")
    print(f"  Content: {preview}\n")

# Chunk a few content lines with the large prefix in place.
lines = [
    "Row 1: Some data here\n",
    "Row 2: More data here\n",
    "Row 3: Even more data\n",
]

chunks_large = chunker_large.chunk_text(lines)

n_prefix_chunks = len(chunker_large.prefix_chunks)
print(f"Total chunks (including prefix chunks): {len(chunks_large)}")
print(f"Content chunks: {len(chunks_large) - n_prefix_chunks}\n")

# The prefix chunks occupy the leading positions of the output list,
# so position alone tells prefix chunks apart from content chunks.
for chunk_no, chunk in enumerate(chunks_large, start=1):
    chunk_type = "[PREFIX CHUNK]" if chunk_no <= n_prefix_chunks else "[CONTENT CHUNK]"

    print(f"Chunk {chunk_no} {chunk_type}:")
    preview = chunk[:100] + "..." if len(chunk) > 100 else chunk
    print(f"  Content: {preview}")
    print(f"  Tokens: {tokenizer.count_tokens(chunk)}\n")

Summary

When to use LineBasedTokenChunker

  • You need to preserve line boundaries (tables, code, logs)
  • You want to add context (headers, metadata) to each chunk
  • You're working with structured text where lines have semantic meaning
  • You need fine-grained control over how lines are split

When to use omit_prefix_on_overflow=True

  • Working with wide tables or long prefixes
  • Token budget is limited
  • Line integrity is more important than consistent formatting
  • You can handle chunks without the prefix in downstream processing

When to use omit_prefix_on_overflow=False (default)

  • You need the prefix in every chunk for context
  • Consistent formatting is critical
  • Downstream processing requires the prefix to understand the content
  • Working with narrow content where the prefix doesn't cause overflow