docs/examples/advanced_chunking_and_serialization.ipynb
In this notebook we show how to customize the serialization strategies that come into play during chunking.
We will work with a document that contains some picture annotations:
from docling_core.types.doc.document import DoclingDocument
# Path to the JSON export of the DoclingDocument used throughout this notebook.
SOURCE = "./data/2408.09869v3_enriched.json"
# Load the document once; the chunkers below all receive it as `dl_doc=doc`.
doc = DoclingDocument.load_from_json(SOURCE)
Below we define the chunker (for more details check out Hybrid Chunking):
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer
# Hugging Face model id whose tokenizer is loaded below.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Wrap the Hugging Face tokenizer in docling-core's tokenizer interface.
tokenizer: BaseTokenizer = HuggingFaceTokenizer(
tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID),
)
# Hybrid chunker with no custom serializer provider.
chunker = HybridChunker(tokenizer=tokenizer)
# Print the value returned by the tokenizer's get_max_tokens().
print(f"{tokenizer.get_max_tokens()=}")
Defining some helper functions:
from typing import Iterable, Optional
from docling_core.transforms.chunker.base import BaseChunk
from docling_core.transforms.chunker.hierarchical_chunker import DocChunk
from docling_core.types.doc.labels import DocItemLabel
from rich.console import Console
from rich.panel import Panel
# Shared rich console; print_chunk and the expansion examples print their panels through it.
console = Console(
width=200, # wide console so that Markdown tables get rendered nicely
)
def find_n_th_chunk_with_label(
    iter: Iterable[BaseChunk], n: int, label: DocItemLabel
) -> tuple[Optional[int], Optional[BaseChunk]]:
    """Locate the n-th chunk that contains a doc item with the given label.

    Args:
        iter: Chunks to scan, in order. The name shadows the builtin but is
            kept so that existing keyword callers keep working.
        n: Zero-based rank among the matching chunks (0 = first match).
        label: Label that at least one doc item of the chunk must carry.

    Returns:
        ``(position, chunk)`` for the match, where ``position`` is the index
        of the chunk within ``iter``, or ``(None, None)`` when fewer than
        ``n + 1`` chunks match.
    """
    num_found = -1
    for i, chunk in enumerate(iter):
        doc_chunk = DocChunk.model_validate(chunk)
        # Count each chunk at most once, even when it holds several items
        # with the label; otherwise n would rank items instead of chunks.
        if any(it.label == label for it in doc_chunk.meta.doc_items):
            num_found += 1
            if num_found == n:
                return i, chunk
    return None, None
def print_chunk(chunks, chunk_pos):
    """Print the chunk at ``chunk_pos`` as a titled panel on the shared console.

    The panel body is the chunk text as contextualized by the module-level
    ``chunker``. The title shows the chunk position, the token count of that
    text according to the module-level ``tokenizer``, and the ``self_ref`` of
    every doc item in the chunk.
    """
    selected = chunks[chunk_pos]
    body = chunker.contextualize(chunk=selected)
    num_tokens = tokenizer.count_tokens(text=body)
    # `num_tokens` and `doc_items_refs` keep their names on purpose: the `=`
    # format specifier below prints the variable names as part of the title.
    doc_items_refs = []
    for doc_item in selected.meta.doc_items:
        doc_items_refs.append(doc_item.self_ref)
    panel = Panel(body, title=f"{chunk_pos=} {num_tokens=} {doc_items_refs=}")
    console.print(panel)
Below we inspect the first chunk containing a table — using the default serialization strategy:
# Chunker without a custom serializer provider, i.e. the default serialization strategy.
chunker = HybridChunker(tokenizer=tokenizer)
chunk_iter = chunker.chunk(dl_doc=doc)
# Materialize the chunks: print_chunk indexes into them by position.
chunks = list(chunk_iter)
# n=0 asks for the first match for DocItemLabel.TABLE.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)
print_chunk(
chunks=chunks,
chunk_pos=i,
)
We can configure a different serialization strategy. In the example below, we specify a different table serializer that serializes tables to Markdown instead of the triplet notation used by default:
from docling_core.transforms.chunker.hierarchical_chunker import (
ChunkingDocSerializer,
ChunkingSerializerProvider,
)
from docling_core.transforms.serializer.markdown import MarkdownTableSerializer
class MDTableSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs ``MarkdownTableSerializer`` in for tables."""

    def get_serializer(self, doc):
        """Return a ``ChunkingDocSerializer`` for ``doc`` using the Markdown table serializer."""
        # Configure a different table serializer than the one used by default.
        md_tables = MarkdownTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=md_tables)
# Same chunker as before, but with the provider that swaps in the Markdown table serializer.
chunker = HybridChunker(
tokenizer=tokenizer,
serializer_provider=MDTableSerializerProvider(),
)
chunk_iter = chunker.chunk(dl_doc=doc)
# Materialize the chunks: print_chunk indexes into them by position.
chunks = list(chunk_iter)
# n=0 asks for the first match for DocItemLabel.TABLE.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)
print_chunk(
chunks=chunks,
chunk_pos=i,
)
Below we inspect the first chunk containing a picture.
Even when using the default strategy, we can modify the relevant parameters, e.g. which placeholder is used for pictures:
from docling_core.transforms.serializer.markdown import MarkdownParams
class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that only overrides the image placeholder parameter."""

    def get_serializer(self, doc):
        """Return a ``ChunkingDocSerializer`` for ``doc`` with a custom image placeholder."""
        # Only the placeholder is set explicitly; no serializer component is replaced.
        placeholder_params = MarkdownParams(image_placeholder="<!-- image -->")
        return ChunkingDocSerializer(doc=doc, params=placeholder_params)
# Chunker using the provider that customizes the image placeholder.
chunker = HybridChunker(
tokenizer=tokenizer,
serializer_provider=ImgPlaceholderSerializerProvider(),
)
chunk_iter = chunker.chunk(dl_doc=doc)
# Materialize the chunks: print_chunk indexes into them by position.
chunks = list(chunk_iter)
# n=0 asks for the first match for DocItemLabel.PICTURE.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)
print_chunk(
chunks=chunks,
chunk_pos=i,
)
Below we define and use our custom picture serialization strategy which leverages picture annotations:
from typing import Any
from docling_core.transforms.serializer.base import (
BaseDocSerializer,
SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.markdown import MarkdownPictureSerializer
from docling_core.types.doc.document import (
PictureClassificationData,
PictureDescriptionData,
PictureItem,
PictureMoleculeData,
)
from typing_extensions import override
class AnnotationPictureSerializer(MarkdownPictureSerializer):
    """Picture serializer that renders a picture's annotations as text lines."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize ``item`` from the annotations found on ``item.meta``.

        Emits, in this order and only when present: the main predicted
        picture class, the molecule SMILES string, and the picture
        description. The lines are joined with newlines, passed through
        ``doc_serializer.post_process`` and wrapped via ``create_ser_result``
        with ``item`` as the span source. A picture without ``meta`` (or
        without any of these annotations) yields the post-processed empty
        string.
        """
        lines: list[str] = []
        meta = item.meta
        if meta is not None:
            classification = meta.classification
            if classification is not None:
                top_prediction = classification.get_main_prediction()
                if top_prediction is not None:
                    lines.append(f"Picture type: {top_prediction.class_name}")

            molecule = meta.molecule
            if molecule is not None:
                lines.append(f"SMILES: {molecule.smi}")

            description = meta.description
            if description is not None:
                lines.append(f"Picture description: {description.text}")

        processed = doc_serializer.post_process(text="\n".join(lines))
        return create_ser_result(text=processed, span_source=item)
class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs ``AnnotationPictureSerializer`` in for pictures."""

    def get_serializer(self, doc: DoclingDocument):
        """Return a ``ChunkingDocSerializer`` for ``doc`` using the annotation picture serializer."""
        # Configure a different picture serializer than the one used by default.
        annotation_pictures = AnnotationPictureSerializer()
        return ChunkingDocSerializer(doc=doc, picture_serializer=annotation_pictures)
# Chunker using the provider that swaps in the annotation-based picture serializer.
chunker = HybridChunker(
tokenizer=tokenizer,
serializer_provider=ImgAnnotationSerializerProvider(),
)
chunk_iter = chunker.chunk(dl_doc=doc)
# Materialize the chunks: print_chunk indexes into them by position.
chunks = list(chunk_iter)
# n=0 asks for the first match for DocItemLabel.PICTURE.
i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)
print_chunk(
chunks=chunks,
chunk_pos=i,
)
In this section, we demonstrate how to expand chunks to include additional context from their containing document items or pages. This is useful when we want to ensure that chunks include complete semantic units or when we need more context for downstream tasks.
We can expand a chunk to include the full content of its containing document item. This ensures that the chunk contains the complete semantic unit (e.g., a full paragraph, section, list, or table) rather than a truncated portion.
from docling_core.transforms.chunker.chunk_expander import TreeChunkExpander
from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer
# Create a chunk expander for expanding to containing doc items
tree_expander = TreeChunkExpander()

# Chunk again with the Markdown table serializer. `chunker` and `chunks` were
# last assigned by the picture-annotation example above, so without this step
# the "original" panel would come from a different serializer setup than the
# `serializer` handed to the expander below.
chunker = HybridChunker(
    tokenizer=tokenizer,
    serializer_provider=MDTableSerializerProvider(),
)
chunk_iter = chunker.chunk(dl_doc=doc)
chunks = list(chunk_iter)
serializer = MDTableSerializerProvider().get_serializer(doc=doc)

# Find the first chunk that contains a table
table_chunk_idx, table_chunk = find_n_th_chunk_with_label(
    chunks, n=0, label=DocItemLabel.TABLE
)

# Expand the chunk to include the full containing doc item (complete table)
expanded_chunk = tree_expander.expand(
    chunk=table_chunk, dl_doc=doc, serializer=serializer
)

# Compare original and expanded chunks
print("Original chunk (partial table):")
print_chunk(chunks=chunks, chunk_pos=table_chunk_idx)
print("\nExpanded chunk (complete table in containing doc item):")
ctx_text = chunker.contextualize(chunk=expanded_chunk)
num_tokens = tokenizer.count_tokens(text=ctx_text)
# `num_tokens` keeps its name: the `=` specifier prints it as part of the title.
title = f"chunk_pos={table_chunk_idx} (expanded) {num_tokens=}"
console.print(Panel(ctx_text, title=title))
We can also expand a chunk to include all content from its containing page. This is particularly useful when we need full page context for tasks like question answering or when working with documents where page boundaries are semantically important.
from docling_core.transforms.chunker.chunk_expander import PageChunkExpander
# Create a chunk expander for expanding to containing pages
page_expander = PageChunkExpander()
# Reuse `table_chunk`, `table_chunk_idx` and `serializer` from the doc-item expansion example
# Expand it to include all content from the containing page
expanded_chunk = page_expander.expand(
chunk=table_chunk, dl_doc=doc, serializer=serializer
)
# Compare original and expanded chunks
print("Original chunk (partial table):")
print_chunk(chunks=chunks, chunk_pos=table_chunk_idx)
print("\nExpanded chunk (full page containing the table):")
# Contextualize and count tokens the same way print_chunk does, so both panels are comparable
ctx_text = chunker.contextualize(chunk=expanded_chunk)
num_tokens = tokenizer.count_tokens(text=ctx_text)
# The `=` specifier prints the variable name `num_tokens` as part of the title
title = f"chunk_pos={table_chunk_idx} (expanded to page) {num_tokens=}"
console.print(Panel(ctx_text, title=title))