docs/examples/pictures_description.ipynb
<a href="https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/pictures_description.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
%pip install -q docling[vlm] ipython
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# The source document: URL of the PDF that the converters below download and process.
DOC_SOURCE = "https://arxiv.org/pdf/2501.17887"
This section will run the ibm-granite/granite-vision-3.1-2b-preview model locally to describe the pictures in the document.
from docling.datamodel.pipeline_options import granite_picture_description
# Configure the PDF pipeline to annotate pictures with the Granite Vision
# model (selected via the preset imported above) running locally.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = (
    granite_picture_description  # <-- the model choice
)
# Custom prompt sent to the vision model for every picture.
# (Fixed typo: "consise" -> "concise".)
pipeline_options.picture_description_options.prompt = (
    "Describe the image in three sentences. Be concise and accurate."
)
# Render pictures at 2x scale and keep the rendered images in the output document.
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Convert the source PDF; this downloads the model on first use.
doc = converter.convert(DOC_SOURCE).document
from docling_core.types.doc.document import PictureDescriptionData
from IPython import display
# Build an HTML snippet per picture and render them all in the notebook.
# NOTE: the original f-strings were corrupted by HTML stripping (the
# <img .../> and <br /> tags were lost, leaving unterminated literals);
# they are reconstructed here.
html_buffer = []
# display the first 5 pictures and their captions and annotations:
for pic in doc.pictures[:5]:
    html_item = (
        f"<h3>Picture <code>{pic.self_ref}</code></h3>"
        f'<img src="{pic.image.uri}" /><br />'
        f"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />"
    )
    if pic.meta is not None and pic.meta.description is not None:
        html_item += (
            f"<h4>Annotations ({pic.meta.description.created_by})</h4>"
            f"{pic.meta.description.text}<br />\n"
        )
    html_buffer.append(html_item)
display.HTML("<hr />".join(html_buffer))
This section will run the HuggingFaceTB/SmolVLM-256M-Instruct model locally to describe the pictures in the document.
from docling.datamodel.pipeline_options import smolvlm_picture_description
# Configure the PDF pipeline to annotate pictures with the SmolVLM model
# (selected via the preset imported above) running locally.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = (
    smolvlm_picture_description  # <-- the model choice
)
# Custom prompt sent to the vision model for every picture.
# (Fixed typo: "consise" -> "concise".)
pipeline_options.picture_description_options.prompt = (
    "Describe the image in three sentences. Be concise and accurate."
)
# Render pictures at 2x scale and keep the rendered images in the output document.
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Convert the source PDF; this downloads the model on first use.
doc = converter.convert(DOC_SOURCE).document
from docling_core.types.doc.document import PictureDescriptionData
from IPython import display
# Build an HTML snippet per picture and render them all in the notebook.
# NOTE: the original f-strings were corrupted by HTML stripping (the
# <img .../> and <br /> tags were lost, leaving unterminated literals);
# they are reconstructed here.
html_buffer = []
# display the first 5 pictures and their captions and annotations:
for pic in doc.pictures[:5]:
    html_item = (
        f"<h3>Picture <code>{pic.self_ref}</code></h3>"
        f'<img src="{pic.image.uri}" /><br />'
        f"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />"
    )
    if pic.meta is not None and pic.meta.description is not None:
        html_item += (
            f"<h4>Annotations ({pic.meta.description.created_by})</h4>"
            f"{pic.meta.description.text}<br />\n"
        )
    html_buffer.append(html_item)
display.HTML("<hr />".join(html_buffer))
The examples above can also be reproduced using other vision models.
The Docling option PictureDescriptionVlmOptions allows you to specify your favorite vision model from the Hugging Face Hub.
from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions
# Configure the PDF pipeline to annotate pictures with an arbitrary
# vision-language model from the Hugging Face Hub, selected by repo_id.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="",  # <-- add here the Hugging Face repo_id of your favorite VLM
    # Fixed typo: "consise" -> "concise".
    prompt="Describe the image in three sentences. Be concise and accurate.",
)
# Render pictures at 2x scale and keep the rendered images in the output document.
pipeline_options.images_scale = 2.0
pipeline_options.generate_picture_images = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Uncomment to run:
# doc = converter.convert(DOC_SOURCE).document