docs/examples/visual_grounding.ipynb
<a href="https://colab.research.google.com/github/docling-project/docling/blob/main/docs/examples/visual_grounding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
| Step | Tech | Execution |
|---|---|---|
| Embedding | Hugging Face / Sentence Transformers | 💻 Local |
| Vector store | Milvus | 💻 Local |
| Gen AI | Hugging Face Inference API | 🌐 Remote |
This example showcases Docling's visual grounding capabilities, which can be combined with any agentic AI / RAG framework.
Here we illustrate them using the LangChain Docling integration, together with a Milvus vector store and sentence-transformers embeddings.
The Hugging Face Inference API used for generation requires an access token, which can be provided via the HF_TOKEN environment variable. Requirements can be installed as shown below (--no-warn-conflicts is meant for Colab's pre-populated Python env; feel free to remove for stricter usage):
%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain matplotlib python-dotenv
import os
from pathlib import Path
from tempfile import mkdtemp
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_docling.loader import ExportType
def _get_env_from_colab_or_os(key):
    try:
        from google.colab import userdata

        try:
            return userdata.get(key)
        except userdata.SecretNotFoundError:
            pass
    except ImportError:
        pass
    return os.getenv(key)
load_dotenv()
# https://github.com/huggingface/transformers/issues/5486:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
HF_TOKEN = _get_env_from_colab_or_os("HF_TOKEN")
SOURCES = ["https://arxiv.org/pdf/2408.09869"] # Docling Technical Report
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
QUESTION = "Which are the main AI models in Docling?"
PROMPT = PromptTemplate.from_template(
"Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {input}\nAnswer:\n",
)
TOP_K = 3
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")
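To sanity-check what the chain will actually send to the model, you can render the prompt template with sample values; a quick sketch (the context placeholder below is illustrative):
print(PROMPT.format(context="<retrieved chunks would go here>", input=QUESTION))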
We first define our converter, in this case including options for keeping page images (for visual grounding).
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=PdfPipelineOptions(
                generate_page_images=True,
                images_scale=2.0,
            ),
        )
    }
)
We set up a simple doc store for keeping converted documents, as that is needed for visual grounding further below.
doc_store = {}
doc_store_root = Path(mkdtemp())
for source in SOURCES:
    dl_doc = converter.convert(source=source).document
    file_path = doc_store_root / f"{dl_doc.origin.binary_hash}.json"
    dl_doc.save_as_json(file_path)
    doc_store[dl_doc.origin.binary_hash] = file_path
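As a quick optional check, any stored document can be reloaded from its JSON serialization, which is the same mechanism the grounding step uses further below:
from docling.datamodel.document import DoclingDocument

sample_path = next(iter(doc_store.values()))
sample_doc = DoclingDocument.load_from_json(sample_path)
print(f"Reloaded document with {len(sample_doc.pages)} pages (incl. page images)")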
Now we can instantiate our loader and load documents.
from langchain_docling import DoclingLoader
from docling.chunking import HybridChunker
loader = DoclingLoader(
    file_path=SOURCES,
    converter=converter,
    export_type=ExportType.DOC_CHUNKS,
    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
)
docs = loader.load()
👉 NOTE: As you see above, using the HybridChunker can sometimes lead to a warning from the transformers library; however, this is a "false alarm" (for details, check the Docling FAQ).
Inspecting some sample splits:
for d in docs[:3]:
    print(f"- {d.page_content=}")
print("...")
import json

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_milvus import Milvus

embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)
vectorstore = Milvus.from_documents(
    documents=docs,
    embedding=embedding,
    collection_name="docling_demo",
    connection_args={"uri": MILVUS_URI},
    index_params={"index_type": "FLAT"},
    drop_old=True,
)
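Before building the full chain, retrieval can optionally be smoke-tested directly on the vector store:
# top-k retrieval sanity check; prints the first 100 chars of each hit:
for doc in vectorstore.similarity_search(QUESTION, k=TOP_K):
    print(f"- {doc.page_content[:100]}...")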
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface import HuggingFaceEndpoint
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
llm = HuggingFaceEndpoint(
    repo_id=GEN_MODEL_ID,
    huggingfacehub_api_token=HF_TOKEN,
    task="text-generation",
)
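Optionally, the endpoint can be smoke-tested before wiring it into the chain (this performs a remote call and needs a valid HF_TOKEN):
# trivial generation call; the exact output will vary:
print(llm.invoke("Say hello!"))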
def clip_text(text, threshold=100):
    return f"{text[:threshold]}..." if len(text) > threshold else text
from docling.chunking import DocMeta
from docling.datamodel.document import DoclingDocument
question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
resp_dict = rag_chain.invoke({"input": QUESTION})
clipped_answer = clip_text(resp_dict["answer"], threshold=200)
print(f"Question:\n{resp_dict['input']}\n\nAnswer:\n{clipped_answer}")
import matplotlib.pyplot as plt
from PIL import ImageDraw
for i, doc in enumerate(resp_dict["context"][:]):
    image_by_page = {}
    print(f"Source {i + 1}:")
    print(f"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}")
    meta = DocMeta.model_validate(doc.metadata["dl_meta"])

    # loading the full DoclingDocument from the document store:
    dl_doc = DoclingDocument.load_from_json(doc_store.get(meta.origin.binary_hash))

    for doc_item in meta.doc_items:
        if doc_item.prov:
            prov = doc_item.prov[0]  # here we only consider the first provenance item
            page_no = prov.page_no
            page = dl_doc.pages[page_no]
            img = image_by_page.get(page_no)
            if img is None:
                print(f"  page: {page_no}")
                img = page.image.pil_image
                image_by_page[page_no] = img

            # convert the bbox to a top-left origin and normalize it to the page size:
            bbox = prov.bbox.to_top_left_origin(page_height=page.size.height)
            bbox = bbox.normalized(page.size)

            # scale the normalized bbox to pixel coordinates, with a small padding:
            thickness = 2
            padding = thickness + 2
            bbox.l = round(bbox.l * img.width - padding)
            bbox.r = round(bbox.r * img.width + padding)
            bbox.t = round(bbox.t * img.height - padding)
            bbox.b = round(bbox.b * img.height + padding)

            # draw the grounding bounding box on the page image:
            draw = ImageDraw.Draw(img)
            draw.rectangle(
                xy=bbox.as_tuple(),
                outline="blue",
                width=thickness,
            )

    # display each annotated page once per source:
    for p in image_by_page:
        img = image_by_page[p]
        plt.figure(figsize=[15, 15])
        plt.imshow(img)
        plt.axis("off")
        plt.show()
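If you prefer files over inline plots, e.g. when running outside a notebook, the display loop can persist each annotated page instead; a minimal variant, meant to replace the plt calls inside the outer loop above (the output directory is illustrative):
out_dir = Path(mkdtemp())  # illustrative output location
for page_no, page_img in image_by_page.items():
    out_file = out_dir / f"source_{i + 1}_page_{page_no}.png"
    page_img.save(out_file)
    print(f"  saved {out_file}")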