Back to Docling

Advanced chunking & serialization

docs/examples/advanced_chunking_and_serialization.ipynb

2.96.0 · 9.9 KB
Original Source

Advanced chunking & serialization

Overview

In this notebook we show how to customize the serialization strategies that come into play during chunking.

Setup

We will work with a document that contains some picture annotations:

python
from docling_core.types.doc.document import DoclingDocument

# Path to a serialized DoclingDocument in JSON form.
SOURCE = "./data/2408.09869v3_enriched.json"

# Load the document once; the cells below all chunk this same `doc` object.
doc = DoclingDocument.load_from_json(SOURCE)

Below we define the chunker (for more details check out Hybrid Chunking):

python
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer

# Hugging Face model id used to load the tokenizer below.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

# Wrap the Hugging Face tokenizer in the docling-core tokenizer type.
tokenizer: BaseTokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
)
# Chunker with no custom serializer provider; later cells rebind `chunker`
# with other serializer providers.
chunker = HybridChunker(tokenizer=tokenizer)
python
# The `=` specifier prints the expression text together with its value.
print(f"{tokenizer.get_max_tokens()=}")

Defining some helper methods:

python
from typing import Iterable, Optional

from docling_core.transforms.chunker.base import BaseChunk
from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
from docling_core.types.doc.labels import DocItemLabel
from rich.console import Console
from rich.panel import Panel

# Shared rich console; `print_chunk` below prints its panels through it.
console = Console(
    width=200,  # for getting Markdown tables rendered nicely
)


def find_n_th_chunk_with_label(
    iter: Iterable[BaseChunk], n: int, label: DocItemLabel
) -> tuple[Optional[int], Optional[BaseChunk]]:
    """Find the n-th chunk (0-based) containing a doc item with ``label``.

    Each chunk is counted at most once, even when it holds several doc items
    carrying the requested label, so consecutive values of ``n`` always refer
    to distinct chunks.

    Args:
        iter: Chunks to scan, in order. The name shadows the builtin but is
            kept so existing callers keep working.
        n: Zero-based rank among the matching chunks.
        label: Doc item label to look for.

    Returns:
        A ``(position, chunk)`` pair, where ``position`` is the index of the
        chunk within ``iter`` and ``chunk`` is the original (unvalidated)
        chunk object; ``(None, None)`` when fewer than ``n + 1`` chunks match.
    """
    num_matching_chunks = 0
    for pos, chunk in enumerate(iter):
        # Validate into DocChunk to get typed access to `meta.doc_items`.
        doc_chunk = DocChunk.model_validate(chunk)
        if any(it.label == label for it in doc_chunk.meta.doc_items):
            if num_matching_chunks == n:
                return pos, chunk
            num_matching_chunks += 1
    return None, None


def print_chunk(chunks, chunk_pos):
    """Print the chunk at ``chunk_pos`` as a titled panel on the shared console.

    The panel body is the chunk text as contextualized by the module-level
    ``chunker``. The title shows the chunk position, the token count reported
    by the module-level ``tokenizer``, and the ``self_ref`` of each doc item
    in the chunk.
    """
    selected = chunks[chunk_pos]
    ctx_text = chunker.contextualize(chunk=selected)
    num_tokens = tokenizer.count_tokens(text=ctx_text)
    # The three names in the title are printed verbatim by the `=` specifier.
    doc_items_refs = [item.self_ref for item in selected.meta.doc_items]
    panel = Panel(ctx_text, title=f"{chunk_pos=} {num_tokens=} {doc_items_refs=}")
    console.print(panel)

Table serialization

Using the default strategy

Below we inspect the first chunk containing a table — using the default serialization strategy:

python
# No serializer provider is passed, so the chunker's default one is used.
# `print_chunk` reads the module-level `chunker`, so this rebinding also
# determines how the chunk is contextualized when printed.
chunker = HybridChunker(tokenizer=tokenizer)

chunk_iter = chunker.chunk(dl_doc=doc)

# Materialize into a list so `print_chunk` can index chunks by position.
chunks = list(chunk_iter)
# n=0 selects the first chunk containing a table item.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)
print_chunk(
    chunks=chunks,
    chunk_pos=i,
)
<div class="alert alert-info"> <strong>INFO</strong>: As you can see above, using the <code>HybridChunker</code> can sometimes trigger a warning from the transformers library; however, this is a "false alarm" — for details, see <a href="https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model">the corresponding FAQ entry</a>. </div>

Configuring a different strategy

We can configure a different serialization strategy. In the example below, we specify a different table serializer that serializes tables to Markdown instead of the triplet notation used by default:

python
from docling_core.transforms.chunker.hierarchical_chunker import (
    ChunkingDocSerializer,
    ChunkingSerializerProvider,
)
from docling_core.transforms.serializer.markdown import MarkdownTableSerializer


class MDTableSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs in ``MarkdownTableSerializer`` for tables."""

    def get_serializer(self, doc):
        """Return a ``ChunkingDocSerializer`` for ``doc`` using the Markdown table serializer."""
        # configuring a different table serializer
        md_table_serializer = MarkdownTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=md_table_serializer)


# Rebind the module-level `chunker` (also used by `print_chunk`) so tables
# are serialized through MDTableSerializerProvider.
chunker = HybridChunker(
    tokenizer=tokenizer,
    serializer_provider=MDTableSerializerProvider(),
)

chunk_iter = chunker.chunk(dl_doc=doc)

# Materialize into a list so `print_chunk` can index chunks by position.
chunks = list(chunk_iter)
# n=0 selects the first chunk containing a table item.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)
print_chunk(
    chunks=chunks,
    chunk_pos=i,
)

Picture serialization

Using the default strategy

Below we inspect the first chunk containing a picture.

Even when using the default strategy, we can modify the relevant parameters, e.g. which placeholder is used for pictures:

python
from docling_core.transforms.serializer.markdown import MarkdownParams


class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that only overrides the image placeholder parameter."""

    def get_serializer(self, doc):
        """Return a ``ChunkingDocSerializer`` for ``doc`` with a custom image placeholder."""
        placeholder_params = MarkdownParams(image_placeholder="<!-- image -->")
        return ChunkingDocSerializer(doc=doc, params=placeholder_params)


# Rebind the module-level `chunker` (also used by `print_chunk`) so pictures
# are rendered with the placeholder set in ImgPlaceholderSerializerProvider.
chunker = HybridChunker(
    tokenizer=tokenizer,
    serializer_provider=ImgPlaceholderSerializerProvider(),
)

chunk_iter = chunker.chunk(dl_doc=doc)

# Materialize into a list so `print_chunk` can index chunks by position.
chunks = list(chunk_iter)
# n=0 selects the first chunk containing a picture item.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)
print_chunk(
    chunks=chunks,
    chunk_pos=i,
)

Using a custom strategy

Below we define and use our custom picture serialization strategy which leverages picture annotations:

python
from typing import Any

from docling_core.transforms.serializer.base import (
    BaseDocSerializer,
    SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.markdown import MarkdownPictureSerializer
from docling_core.types.doc.document import (
    PictureClassificationData,
    PictureDescriptionData,
    PictureItem,
    PictureMoleculeData,
)
from typing_extensions import override


class AnnotationPictureSerializer(MarkdownPictureSerializer):
    """Picture serializer that renders a picture's metadata as plain text lines."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize ``item`` from whichever metadata fields are present.

        Emits, in this order and only when available: the main predicted
        picture type, the molecule's SMILES string, and the picture
        description. The lines are joined with newlines, passed through
        ``doc_serializer.post_process`` and wrapped into a serialization
        result whose span source is ``item``. With no metadata the joined
        text is empty.
        """
        lines: list[str] = []

        meta = item.meta
        if meta is not None:
            classification = meta.classification
            if classification is not None:
                top_prediction = classification.get_main_prediction()
                if top_prediction is not None:
                    lines.append(f"Picture type: {top_prediction.class_name}")

            molecule = meta.molecule
            if molecule is not None:
                lines.append(f"SMILES: {molecule.smi}")

            description = meta.description
            if description is not None:
                lines.append(f"Picture description: {description.text}")

        joined = "\n".join(lines)
        processed = doc_serializer.post_process(text=joined)
        return create_ser_result(text=processed, span_source=item)
python
class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs in ``AnnotationPictureSerializer`` for pictures."""

    def get_serializer(self, doc: DoclingDocument):
        """Return a ``ChunkingDocSerializer`` for ``doc`` using the annotation picture serializer."""
        # configuring a different picture serializer
        annotation_serializer = AnnotationPictureSerializer()
        return ChunkingDocSerializer(doc=doc, picture_serializer=annotation_serializer)


# Rebind the module-level `chunker` (also used by `print_chunk`) so pictures
# are serialized through ImgAnnotationSerializerProvider.
chunker = HybridChunker(
    tokenizer=tokenizer,
    serializer_provider=ImgAnnotationSerializerProvider(),
)

chunk_iter = chunker.chunk(dl_doc=doc)

# Materialize into a list so `print_chunk` can index chunks by position.
chunks = list(chunk_iter)
# n=0 selects the first chunk containing a picture item.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)
print_chunk(
    chunks=chunks,
    chunk_pos=i,
)

Chunk expansion

In this section, we demonstrate how to expand chunks to include additional context from their containing document items or pages. This is useful when we want to ensure that chunks include complete semantic units or when we need more context for downstream tasks.

Expansion to containing DocItem

We can expand a chunk to include the full content of its containing document item. This ensures that the chunk contains the complete semantic unit (e.g., a full paragraph, section, list, or table) rather than a truncated portion.

python
from docling_core.transforms.chunker.chunk_expander import TreeChunkExpander
from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer

# Create a chunk expander for expanding to containing doc items
tree_expander = TreeChunkExpander()
serializer = MDTableSerializerProvider().get_serializer(doc=doc)

# `chunker` and `chunks` were last rebound by the picture serialization
# example, so re-chunk here with the Markdown table serializer. This keeps the
# "original" chunk and the expanded chunk serialized the same way, and makes
# `print_chunk` / `chunker.contextualize` below use the matching chunker.
chunker = HybridChunker(
    tokenizer=tokenizer,
    serializer_provider=MDTableSerializerProvider(),
)
chunks = list(chunker.chunk(dl_doc=doc))

# Find the first chunk that contains a table
table_chunk_idx, table_chunk = find_n_th_chunk_with_label(
    chunks, n=0, label=DocItemLabel.TABLE
)

# Expand the chunk to include the full containing doc item (complete table)
expanded_chunk = tree_expander.expand(
    chunk=table_chunk, dl_doc=doc, serializer=serializer
)

# Compare original and expanded chunks
print("Original chunk (partial table):")
print_chunk(chunks=chunks, chunk_pos=table_chunk_idx)

print("\nExpanded chunk (complete table in containing doc item):")
ctx_text = chunker.contextualize(chunk=expanded_chunk)
num_tokens = tokenizer.count_tokens(text=ctx_text)
title = f"chunk_pos={table_chunk_idx} (expanded) {num_tokens=}"
console.print(Panel(ctx_text, title=title))

Expansion to containing page

We can also expand a chunk to include all content from its containing page. This is particularly useful when we need full page context for tasks like question answering or when working with documents where page boundaries are semantically important.

python
from docling_core.transforms.chunker.chunk_expander import PageChunkExpander

# Create a chunk expander for expanding to containing pages
page_expander = PageChunkExpander()

# Reuse `table_chunk`, `table_chunk_idx`, `chunks` and `serializer` from the
# previous cell, and expand the table chunk to all content of its page
expanded_chunk = page_expander.expand(
    chunk=table_chunk, dl_doc=doc, serializer=serializer
)

# Compare original and expanded chunks
print("Original chunk (partial table):")
print_chunk(chunks=chunks, chunk_pos=table_chunk_idx)

print("\nExpanded chunk (full page containing the table):")
# The expanded chunk is not part of `chunks`, so it is rendered inline here
# instead of going through `print_chunk`.
ctx_text = chunker.contextualize(chunk=expanded_chunk)
num_tokens = tokenizer.count_tokens(text=ctx_text)
title = f"chunk_pos={table_chunk_idx} (expanded to page) {num_tokens=}"
console.print(Panel(ctx_text, title=title))