docs/examples/pictures_description.ipynb
<a href="https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/pictures_description.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%pip install -q docling[vlm] ipython
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# The source document: URL of the PDF that the converters below download and process.
DOC_SOURCE = "https://arxiv.org/pdf/2501.17887"
This section will run the ibm-granite/granite-vision-3.1-2b-preview model locally to describe the pictures in the document.
from docling.datamodel.pipeline_options import granite_picture_description
# Configure the PDF pipeline to annotate pictures with the Granite Vision
# model (selected via the preset imported above) running locally.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = (
    granite_picture_description  # <-- the model choice
)
# Custom prompt sent to the vision model for every picture.
# (Fixed typo: "consise" -> "concise".)
pipeline_options.picture_description_options.prompt = (
    "Describe the image in three sentences. Be concise and accurate."
)
# Render pictures at 2x scale and keep the rendered images in the output document.
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Convert the source PDF; this downloads the model on first use.
doc = converter.convert(DOC_SOURCE).document
from docling_core.types.doc.document import PictureDescriptionData
from IPython import display
# Build an HTML snippet per picture and render them all in the notebook.
# NOTE: the original f-strings were corrupted by HTML stripping (the
# <img .../> and <br /> tags were lost, leaving unterminated literals);
# they are reconstructed here.
html_buffer = []
# display the first 5 pictures and their captions and annotations:
for pic in doc.pictures[:5]:
    html_item = (
        f"<h3>Picture <code>{pic.self_ref}</code></h3>"
        f'<img src="{pic.image.uri}" /><br />'
        f"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />"
    )
    if pic.meta is not None and pic.meta.description is not None:
        html_item += (
            f"<h4>Annotations ({pic.meta.description.created_by})</h4>"
            f"{pic.meta.description.text}<br />\n"
        )
    html_buffer.append(html_item)
display.HTML("<hr />".join(html_buffer))
This section will run the HuggingFaceTB/SmolVLM-256M-Instruct model locally to describe the pictures in the document.
from docling.datamodel.pipeline_options import smolvlm_picture_description
# Configure the PDF pipeline to annotate pictures with the SmolVLM model
# (selected via the preset imported above) running locally.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = (
    smolvlm_picture_description  # <-- the model choice
)
# Custom prompt sent to the vision model for every picture.
# (Fixed typo: "consise" -> "concise".)
pipeline_options.picture_description_options.prompt = (
    "Describe the image in three sentences. Be concise and accurate."
)
# Render pictures at 2x scale and keep the rendered images in the output document.
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Convert the source PDF; this downloads the model on first use.
doc = converter.convert(DOC_SOURCE).document
from docling_core.types.doc.document import PictureDescriptionData
from IPython import display
# Build an HTML snippet per picture and render them all in the notebook.
# NOTE: the original f-strings were corrupted by HTML stripping (the
# <img .../> and <br /> tags were lost, leaving unterminated literals);
# they are reconstructed here.
html_buffer = []
# display the first 5 pictures and their captions and annotations:
for pic in doc.pictures[:5]:
    html_item = (
        f"<h3>Picture <code>{pic.self_ref}</code></h3>"
        f'<img src="{pic.image.uri}" /><br />'
        f"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />"
    )
    if pic.meta is not None and pic.meta.description is not None:
        html_item += (
            f"<h4>Annotations ({pic.meta.description.created_by})</h4>"
            f"{pic.meta.description.text}<br />\n"
        )
    html_buffer.append(html_item)
display.HTML("<hr />".join(html_buffer))
The examples above can also be reproduced using other vision models.
The Docling option PictureDescriptionVlmOptions allows you to specify your favorite vision model from the Hugging Face Hub.
from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions
# Configure the PDF pipeline to annotate pictures with an arbitrary
# vision-language model from the Hugging Face Hub, selected by repo_id.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="",  # <-- add here the Hugging Face repo_id of your favorite VLM
    # Fixed typo: "consise" -> "concise".
    prompt="Describe the image in three sentences. Be concise and accurate.",
)
# Render pictures at 2x scale and keep the rendered images in the output document.
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Uncomment to run:
# doc = converter.convert(DOC_SOURCE).document