docs/examples/rag_langchain.ipynb
<a href="https://colab.research.google.com/github/docling-project/docling/blob/main/docs/examples/rag_langchain.ipynb" target="_parent"></a>
| Step | Tech | Execution |
|---|---|---|
| Embedding | Hugging Face / Sentence Transformers | 💻 Local |
| Vector store | Milvus | 💻 Local |
| Gen AI | Hugging Face Inference API | 🌐 Remote |
This example leverages the LangChain Docling integration, along with a Milvus vector store and sentence-transformers embeddings.

The presented `DoclingLoader` component enables you to:
- use various document types in your LLM applications with ease and speed, and
- leverage Docling's rich format for advanced, document-native grounding.

`DoclingLoader` supports two different export modes:
- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and to then capture each individual chunk as a separate LangChain document downstream, or
- `ExportType.MARKDOWN`: if you want to capture each input document as a separate LangChain document.

The example allows exploring both modes via the parameter `EXPORT_TYPE`; depending on the value set, the example pipeline is then set up accordingly.
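As a minimal sketch of how the two modes differ in practice (using an illustrative file name `sample.pdf`, not part of this example):

```python
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# DOC_CHUNKS (default): one LangChain document per chunk
chunk_docs = DoclingLoader(file_path="sample.pdf").load()

# MARKDOWN: one LangChain document per input file,
# with the file's Markdown export as page_content
md_docs = DoclingLoader(
    file_path="sample.pdf",
    export_type=ExportType.MARKDOWN,
).load()
```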
The notebook uses HuggingFace's Inference API; for increased LLM quota, a token can be provided via the environment variable `HF_TOKEN`.

Requirements can be installed as shown below (`--no-warn-conflicts` is meant for Colab's pre-populated Python environment; feel free to remove it for stricter usage):

%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain python-dotenv
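If running locally, the token can also be placed in a `.env` file next to the notebook, which the `load_dotenv()` call below picks up; the value shown here is a placeholder:

```
# .env (placeholder value, replace with your own token)
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
```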
import os
from pathlib import Path
from tempfile import mkdtemp
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_docling.loader import ExportType
def _get_env_from_colab_or_os(key):
try:
from google.colab import userdata
try:
return userdata.get(key)
except userdata.SecretNotFoundError:
pass
except ImportError:
pass
return os.getenv(key)
load_dotenv()
# https://github.com/huggingface/transformers/issues/5486:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
HF_TOKEN = _get_env_from_colab_or_os("HF_TOKEN")
FILE_PATH = ["https://arxiv.org/pdf/2408.09869"] # Docling Technical Report
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
EXPORT_TYPE = ExportType.DOC_CHUNKS
QUESTION = "Which are the main AI models in Docling?"
PROMPT = PromptTemplate.from_template(
"Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {input}\nAnswer:\n",
)
TOP_K = 3
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")
Now we can instantiate our loader and load documents.
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from langchain_docling import DoclingLoader
from transformers import AutoTokenizer
from docling.chunking import HybridChunker
tokenizer = HuggingFaceTokenizer(
tokenizer=AutoTokenizer.from_pretrained(EMBED_MODEL_ID)
)
loader = DoclingLoader(
file_path=FILE_PATH,
export_type=EXPORT_TYPE,
chunker=HybridChunker(tokenizer=tokenizer),
)
docs = loader.load()
Note: a message saying "Token indices sequence length is longer than the specified maximum sequence length..." can be ignored in this case; details here.
Determining the splits:
if EXPORT_TYPE == ExportType.DOC_CHUNKS:
splits = docs
elif EXPORT_TYPE == ExportType.MARKDOWN:
from langchain_text_splitters import MarkdownHeaderTextSplitter
splitter = MarkdownHeaderTextSplitter(
headers_to_split_on=[
("#", "Header_1"),
("##", "Header_2"),
("###", "Header_3"),
],
)
splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]
else:
raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")
Inspecting some sample splits:
for d in splits[:3]:
print(f"- {d.page_content=}")
print("...")
import json
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_milvus import Milvus
embedding = HuggingFaceEmbeddings(model_name=EMBED_MODEL_ID)
vectorstore = Milvus.from_documents(
    documents=splits,
    embedding=embedding,
    collection_name="docling_demo",
    connection_args={"uri": MILVUS_URI},  # local Milvus Lite DB path set above
    index_params={"index_type": "FLAT"},
    drop_old=True,
)
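Before assembling the full RAG chain, retrieval can be sanity-checked directly against the vector store (an optional step):

```python
# Optional: fetch the top-K nearest chunks for the question directly.
for doc in vectorstore.similarity_search(QUESTION, k=TOP_K):
    print("-", doc.page_content[:100], "...")
```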
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface import HuggingFaceEndpoint
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})
llm = HuggingFaceEndpoint(
repo_id=GEN_MODEL_ID,
huggingfacehub_api_token=HF_TOKEN,
task="text-generation",
)
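Optionally, the remote endpoint can be smoke-tested on its own before building the chain, to confirm the token and model are working:

```python
# Optional: a direct call to the remote LLM, bypassing retrieval.
print(llm.invoke("What does RAG stand for?"))
```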
def clip_text(text, threshold=100):
return f"{text[:threshold]}..." if len(text) > threshold else text
question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
resp_dict = rag_chain.invoke({"input": QUESTION})
clipped_answer = clip_text(resp_dict["answer"], threshold=200)
print(f"Question:\n{resp_dict['input']}\n\nAnswer:\n{clipped_answer}")
for i, doc in enumerate(resp_dict["context"]):
print()
print(f"Source {i + 1}:")
print(f" text: {json.dumps(clip_text(doc.page_content, threshold=350))}")
for key in doc.metadata:
if key != "pk":
val = doc.metadata.get(key)
clipped_val = clip_text(val) if isinstance(val, str) else val
print(f" {key}: {clipped_val}")