docs/examples/rag_azuresearch.ipynb
<a href="https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_azuresearch.ipynb" target="_parent">Open In Colab</a>
| Step | Tech | Execution |
|---|---|---|
| Embedding | Azure OpenAI | 🌐 Remote |
| Vector Store | Azure AI Search | 🌐 Remote |
| Gen AI | Azure OpenAI | 🌐 Remote |
This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using Docling for document parsing and chunking, Azure AI Search for vector indexing and retrieval, and Azure OpenAI for embeddings and chat completion.

This sample walks through:

- Parsing a PDF with Docling and splitting it into chunks with the HierarchicalChunker
- Creating an Azure AI Search vector index and uploading embedded chunks
- Retrieving relevant chunks for a query and generating a grounded answer with Azure OpenAI
# If running in a fresh environment (like Google Colab), run this cell to install the dependencies:
%pip install "docling~=2.12" azure-search-documents==11.5.2 azure-identity openai rich torch python-dotenv
Prerequisites:

- An Azure AI Search resource
- An Azure OpenAI resource with a deployed embedding model and a deployed chat completion model (e.g. text-embedding-3-small and gpt-4o)
- Docling 2.12+ installed in a Python 3.8+ environment (docling_core is installed automatically)
- Optional: a GPU-enabled environment for faster parsing; Docling 2.12 automatically detects and uses a GPU if present
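The next cell reads these settings from environment variables (or Colab secrets). If you keep them in a local .env file, it might look like the following sketch (placeholder values shown; the variable names match the code below):

```
AZURE_SEARCH_ENDPOINT=https://<your-search-service>.search.windows.net
AZURE_SEARCH_KEY=<your-search-admin-key>
AZURE_SEARCH_INDEX_NAME=docling-rag-sample
AZURE_OPENAI_ENDPOINT=https://<your-openai-resource>.openai.azure.com
AZURE_OPENAI_API_KEY=<your-openai-api-key>
AZURE_OPENAI_API_VERSION=2024-10-21
AZURE_OPENAI_CHAT_MODEL=gpt-4o
AZURE_OPENAI_EMBEDDINGS=text-embedding-3-small
```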
import os
from dotenv import load_dotenv
load_dotenv()
def _get_env(key, default=None):
try:
from google.colab import userdata
try:
return userdata.get(key)
except userdata.SecretNotFoundError:
pass
except ImportError:
pass
return os.getenv(key, default)
AZURE_SEARCH_ENDPOINT = _get_env("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = _get_env("AZURE_SEARCH_KEY") # Ensure this is your Admin Key
AZURE_SEARCH_INDEX_NAME = _get_env("AZURE_SEARCH_INDEX_NAME", "docling-rag-sample")
AZURE_OPENAI_ENDPOINT = _get_env("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = _get_env("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_API_VERSION = _get_env("AZURE_OPENAI_API_VERSION", "2024-10-21")
AZURE_OPENAI_CHAT_MODEL = _get_env(
    "AZURE_OPENAI_CHAT_MODEL"
)  # Name of your chat completion deployment, e.g. "gpt-4o"
AZURE_OPENAI_EMBEDDINGS = _get_env(
    "AZURE_OPENAI_EMBEDDINGS", "text-embedding-3-small"
)  # Name of your embeddings deployment, e.g. "text-embedding-3-small"
We’ll parse the Microsoft GraphRAG Research Paper (~15 pages). Parsing should be relatively quick, even on CPU, but it will be faster on a GPU or MPS device if available.
(If you prefer a different document, simply provide a different URL or local file path.)
from rich.console import Console
from rich.panel import Panel
from docling.document_converter import DocumentConverter
console = Console()
# This URL points to the Microsoft GraphRAG Research Paper (arXiv: 2404.16130), ~15 pages
source_url = "https://arxiv.org/pdf/2404.16130"
console.print(
"[bold yellow]Parsing a ~15-page PDF. The process should be relatively quick, even on CPU...[/bold yellow]"
)
converter = DocumentConverter()
result = converter.convert(source_url)
# Optional: preview the parsed Markdown
md_preview = result.document.export_to_markdown()
console.print(Panel(md_preview[:500] + "...", title="Docling Markdown Preview"))
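If you want to keep the parsed output for later inspection, you can also write the full Markdown export to disk; a minimal sketch (the output filename is just an illustrative choice):

```python
from pathlib import Path

# Persist the full Markdown export (illustrative path).
Path("graphrag_paper.md").write_text(
    result.document.export_to_markdown(), encoding="utf-8"
)
```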
We convert the parsed document into smaller chunks for embedding and indexing. The built-in HierarchicalChunker splits along the document hierarchy, so chunk boundaries follow the structure Docling extracted.
from docling.chunking import HierarchicalChunker
chunker = HierarchicalChunker()
doc_chunks = list(chunker.chunk(result.document))
all_chunks = []
for idx, c in enumerate(doc_chunks):
chunk_text = c.text
all_chunks.append((f"chunk_{idx}", chunk_text))
console.print(f"Total chunks from PDF: {len(all_chunks)}")
We’ll define a vector index in Azure AI Search, then embed each chunk using Azure OpenAI and upload in batches.
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
AzureOpenAIVectorizer,
AzureOpenAIVectorizerParameters,
HnswAlgorithmConfiguration,
SearchableField,
SearchField,
SearchFieldDataType,
SearchIndex,
SimpleField,
VectorSearch,
VectorSearchProfile,
)
from rich.console import Console
console = Console()
VECTOR_DIM = 1536  # Must match your embeddings model: 1536 for text-embedding-3-small, 3072 for text-embedding-3-large
index_client = SearchIndexClient(
AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY)
)
def create_search_index(index_name: str):
# Define fields
fields = [
SimpleField(name="chunk_id", type=SearchFieldDataType.String, key=True),
SearchableField(name="content", type=SearchFieldDataType.String),
SearchField(
name="content_vector",
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
searchable=True,
filterable=False,
sortable=False,
facetable=False,
vector_search_dimensions=VECTOR_DIM,
vector_search_profile_name="default",
),
]
# Vector search config with an AzureOpenAIVectorizer
vector_search = VectorSearch(
algorithms=[HnswAlgorithmConfiguration(name="default")],
profiles=[
VectorSearchProfile(
name="default",
algorithm_configuration_name="default",
vectorizer_name="default",
)
],
vectorizers=[
AzureOpenAIVectorizer(
vectorizer_name="default",
parameters=AzureOpenAIVectorizerParameters(
resource_url=AZURE_OPENAI_ENDPOINT,
deployment_name=AZURE_OPENAI_EMBEDDINGS,
                    model_name="text-embedding-3-small",  # underlying model; must match the deployment above
api_key=AZURE_OPENAI_API_KEY,
),
)
],
)
    # Recreate the index: delete any existing index with this name, then create it fresh
new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
try:
index_client.delete_index(index_name)
except Exception:
pass
index_client.create_or_update_index(new_index)
console.print(f"Index '{index_name}' created.")
create_search_index(AZURE_SEARCH_INDEX_NAME)
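As an optional sanity check, you can fetch the index definition back from the service to confirm it was created with the expected fields:

```python
# Optional: confirm the index exists and inspect its fields.
created = index_client.get_index(AZURE_SEARCH_INDEX_NAME)
console.print(f"Index '{created.name}' fields: {[f.name for f in created.fields]}")
```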
from azure.search.documents import SearchClient
from openai import AzureOpenAI
search_client = SearchClient(
AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)
)
openai_client = AzureOpenAI(
api_key=AZURE_OPENAI_API_KEY,
api_version=AZURE_OPENAI_API_VERSION,
azure_endpoint=AZURE_OPENAI_ENDPOINT,
)
def embed_text(text: str):
"""
Helper to generate embeddings with Azure OpenAI.
"""
response = openai_client.embeddings.create(
input=text, model=AZURE_OPENAI_EMBEDDINGS
)
return response.data[0].embedding
upload_docs = []
for chunk_id, chunk_text in all_chunks:
embedding_vector = embed_text(chunk_text)
upload_docs.append(
{
"chunk_id": chunk_id,
"content": chunk_text,
"content_vector": embedding_vector,
}
)
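One embeddings call per chunk is simple but slow for large documents. As an alternative sketch, the embeddings endpoint also accepts a list of inputs and returns vectors in the same order (EMBED_BATCH_SIZE is an illustrative choice, not part of the original sample):

```python
# Alternative: batch the embedding calls instead of looping chunk by chunk.
EMBED_BATCH_SIZE = 16
upload_docs = []
for start in range(0, len(all_chunks), EMBED_BATCH_SIZE):
    batch = all_chunks[start : start + EMBED_BATCH_SIZE]
    resp = openai_client.embeddings.create(
        input=[text for _, text in batch], model=AZURE_OPENAI_EMBEDDINGS
    )
    for (chunk_id, chunk_text), item in zip(batch, resp.data):
        upload_docs.append(
            {
                "chunk_id": chunk_id,
                "content": chunk_text,
                "content_vector": item.embedding,
            }
        )
```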
BATCH_SIZE = 50
for i in range(0, len(upload_docs), BATCH_SIZE):
subset = upload_docs[i : i + BATCH_SIZE]
resp = search_client.upload_documents(documents=subset)
all_succeeded = all(r.succeeded for r in resp)
console.print(
f"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, "
f"first_doc_status_code: {resp[0].status_code}"
)
console.print("All chunks uploaded to Azure Search.")
Combine retrieval from Azure AI Search with Azure OpenAI Chat Completions, i.e. ground the LLM's answers in the retrieved document chunks.
from typing import Optional
from azure.search.documents.models import VectorizableTextQuery
def generate_chat_response(prompt: str, system_message: Optional[str] = None):
"""
Generates a single-turn chat response using Azure OpenAI Chat.
If you need multi-turn conversation or follow-up queries, you'll have to
maintain the messages list externally.
"""
messages = []
if system_message:
messages.append({"role": "system", "content": system_message})
messages.append({"role": "user", "content": prompt})
completion = openai_client.chat.completions.create(
model=AZURE_OPENAI_CHAT_MODEL, messages=messages, temperature=0.7
)
return completion.choices[0].message.content
user_query = "What are the main advantages of using the Graph RAG approach for query-focused summarization compared to traditional RAG methods?"
# The index's vectorizer embeds the query server-side, so no client-side
# embedding is needed; passing the raw text also enables hybrid search.
vector_query = VectorizableTextQuery(
    text=user_query,
    k_nearest_neighbors=5,
    fields="content_vector",
)
search_results = search_client.search(
search_text=user_query, vector_queries=[vector_query], select=["content"], top=10
)
retrieved_chunks = []
for result in search_results:
snippet = result["content"]
retrieved_chunks.append(snippet)
context_str = "\n---\n".join(retrieved_chunks)
rag_prompt = f"""
You are an AI assistant answering questions about Microsoft GraphRAG.
Use ONLY the text below to answer the user's question.
If the answer isn't in the text, say you don't know.
Context:
{context_str}
Question: {user_query}
Answer:
"""
final_answer = generate_chat_response(rag_prompt)
console.print(Panel(rag_prompt, title="RAG Prompt", style="bold red"))
console.print(Panel(final_answer, title="RAG Response", style="bold green"))
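The generate_chat_response helper is single-turn; as its docstring notes, follow-up questions require maintaining the messages list yourself. A minimal sketch of one follow-up turn, reusing the clients and variables above:

```python
# Hypothetical follow-up turn: carry the conversation history explicitly.
messages = [
    {"role": "user", "content": rag_prompt},
    {"role": "assistant", "content": final_answer},
    {"role": "user", "content": "Summarize that answer in one sentence."},
]
followup = openai_client.chat.completions.create(
    model=AZURE_OPENAI_CHAT_MODEL, messages=messages, temperature=0.7
)
console.print(Panel(followup.choices[0].message.content, title="Follow-up Response"))
```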