docs/examples/rag_haystack.ipynb
<a href="https://colab.research.google.com/github/docling-project/docling/blob/main/docs/examples/rag_haystack.ipynb" target="_parent"></a>
| Step | Tech | Execution |
|---|---|---|
| Embedding | Hugging Face / Sentence Transformers | 💻 Local |
| Vector store | Milvus | 💻 Local |
| Gen AI | Hugging Face Inference API | 🌐 Remote |
This example leverages the Haystack Docling extension, along with Milvus-based document store and retriever instances, as well as sentence-transformers embeddings.
The presented DoclingConverter component enables you to:
DoclingConverter supports two different export modes:
- `ExportType.MARKDOWN`: if you want to capture each input document as a separate
Haystack document, or
- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and
to then capture each individual chunk as a separate Haystack document downstream.

The example allows you to explore both modes via the parameter `EXPORT_TYPE`; depending on the
value set, the ingestion and RAG pipelines are then set up accordingly.
- A Hugging Face access token should be available as the environment variable `HF_TOKEN`.
- Install the required dependencies (`--no-warn-conflicts` is meant for Colab's pre-populated Python env; feel free to remove it for stricter usage):

%pip install -q --progress-bar off --no-warn-conflicts docling-haystack haystack-ai docling "pymilvus[milvus-lite]" milvus-haystack sentence-transformers python-dotenv
import os
from pathlib import Path
from tempfile import mkdtemp
from docling_haystack.converter import ExportType
from dotenv import load_dotenv
def _get_env_from_colab_or_os(key):
try:
from google.colab import userdata
try:
return userdata.get(key)
except userdata.SecretNotFoundError:
pass
except ImportError:
pass
return os.getenv(key)
load_dotenv()  # pull variables from a local .env file into the environment, if present
# Token for the remote Hugging Face Inference API (may be None if not configured).
HF_TOKEN = _get_env_from_colab_or_os("HF_TOKEN")
# Input documents to ingest; URLs and local paths are both accepted.
PATHS = ["https://arxiv.org/pdf/2408.09869"]  # Docling Technical Report
# Sentence-transformers model used for both document and query embeddings.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Model served by the HF Inference API for answer generation.
GENERATION_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# DOC_CHUNKS: one Haystack document per chunk; MARKDOWN: one per input document.
EXPORT_TYPE = ExportType.DOC_CHUNKS
QUESTION = "Which are the main AI models in Docling?"
TOP_K = 3  # number of documents the retriever returns
# Milvus Lite stores its data in a local file; use a throwaway temp dir.
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")
from docling_haystack.converter import DoclingConverter
from haystack import Pipeline
from haystack.components.embedders import (
SentenceTransformersDocumentEmbedder,
SentenceTransformersTextEmbedder,
)
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from milvus_haystack import MilvusDocumentStore, MilvusEmbeddingRetriever
from docling.chunking import HybridChunker
# Local Milvus Lite instance backed by the temp-dir file created above.
document_store = MilvusDocumentStore(
    connection_args={"uri": MILVUS_URI},
    drop_old=True,  # start from an empty collection on every run
    text_field="txt",  # set for preventing conflict with same-name metadata field
)

# Ingestion pipeline: convert -> (optionally split) -> embed -> write.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
    "converter",
    DoclingConverter(
        export_type=EXPORT_TYPE,
        chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
    ),
)
indexing_pipeline.add_component(
    "embedder", SentenceTransformersDocumentEmbedder(model=EMBED_MODEL_ID)
)
indexing_pipeline.add_component(
    "writer", DocumentWriter(document_store=document_store)
)

if EXPORT_TYPE == ExportType.DOC_CHUNKS:
    # Chunks come out of the converter ready for embedding.
    indexing_pipeline.connect("converter", "embedder")
elif EXPORT_TYPE == ExportType.MARKDOWN:
    # Whole-document markdown must be split into sentences before embedding.
    indexing_pipeline.add_component(
        "splitter",
        DocumentSplitter(split_by="sentence", split_length=1),
    )
    indexing_pipeline.connect("converter.documents", "splitter.documents")
    indexing_pipeline.connect("splitter.documents", "embedder.documents")
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")

indexing_pipeline.connect("embedder", "writer")
indexing_pipeline.run({"converter": {"paths": PATHS}})
from haystack.components.builders import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret
# Jinja2 template rendered by PromptBuilder: the retrieved documents are
# concatenated, followed by the user query.
prompt_template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
Question: {{query}}
Answer:
"""
# RAG pipeline: embed query -> retrieve -> build prompt -> generate -> wrap answer.
rag_pipe = Pipeline()
rag_pipe.add_component(
    "embedder", SentenceTransformersTextEmbedder(model=EMBED_MODEL_ID)
)
rag_pipe.add_component(
    "retriever",
    MilvusEmbeddingRetriever(document_store=document_store, top_k=TOP_K),
)
rag_pipe.add_component("prompt_builder", PromptBuilder(template=prompt_template))
rag_pipe.add_component(
    "llm",
    HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": GENERATION_MODEL_ID},
        # Only wrap the token when one is actually set.
        token=Secret.from_token(HF_TOKEN) if HF_TOKEN else None,
    ),
)
rag_pipe.add_component("answer_builder", AnswerBuilder())

# Wire the components; sender -> receiver pairs listed data-first.
for sender, receiver in [
    ("embedder.embedding", "retriever"),
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
    ("llm.replies", "answer_builder.replies"),
    ("llm.meta", "answer_builder.meta"),
    ("retriever", "answer_builder.documents"),
]:
    rag_pipe.connect(sender, receiver)

rag_res = rag_pipe.run(
    {
        "embedder": {"text": QUESTION},
        "prompt_builder": {"query": QUESTION},
        "answer_builder": {"query": QUESTION},
    }
)
Below we print out the RAG results. If you have used ExportType.DOC_CHUNKS, notice how
the sources contain document-level grounding (e.g. page number or bounding box
information):
from docling.chunking import DocChunk

# Print the question, the generated answer, and per-source grounding details.
answer = rag_res["answer_builder"]["answers"][0]
print(f"Question:\n{QUESTION}\n")
print(f"Answer:\n{answer.data.strip()}\n")
print("Sources:")
for src in answer.documents:
    if EXPORT_TYPE == ExportType.DOC_CHUNKS:
        # Docling chunk metadata travels in the "dl_meta" field.
        chunk = DocChunk.model_validate(src.meta["dl_meta"])
        print(f"- text: {chunk.text!r}")
        if chunk.meta.origin:
            print(f" file: {chunk.meta.origin.filename}")
        if chunk.meta.headings:
            print(f" section: {' / '.join(chunk.meta.headings)}")
        prov = chunk.meta.doc_items[0].prov[0]
        bbox = prov.bbox
        print(
            f" page: {prov.page_no}, "
            f"bounding box: [{int(bbox.l)}, {int(bbox.t)}, {int(bbox.r)}, {int(bbox.b)}]"
        )
    elif EXPORT_TYPE == ExportType.MARKDOWN:
        print(repr(src.content))
    else:
        raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")