packages/graphrag-llm/notebooks/07_rate_limiting.ipynb
Rate limiting is disabled by default. Requests can be limited by requests per period, by tokens per period, or by both.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import json
import os
import time
from dotenv import load_dotenv
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig, RateLimitConfig, RateLimitType
# Pull environment variables from a local .env file (if present).
load_dotenv()

api_key = os.getenv("GRAPHRAG_API_KEY")

# Prefer API-key auth when a key is provided; otherwise fall back to
# Azure managed identity.
auth = AuthMethod.ApiKey if api_key else AuthMethod.AzureManagedIdentity

# Sliding window: at most 3 requests per 60-second period, which paces
# calls out to roughly one every 20 seconds.
throttle = RateLimitConfig(
    type=RateLimitType.SlidingWindow,
    period_in_seconds=60,
    requests_per_period=3,
)

model_config = ModelConfig(
    model_provider="azure",
    model=os.getenv("GRAPHRAG_MODEL", "gpt-4o"),
    azure_deployment_name=os.getenv("GRAPHRAG_MODEL", "gpt-4o"),
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    auth_method=auth,
    rate_limit=throttle,
)
# Build the completion client from the configured model settings.
llm_completion: LLMCompletion = create_completion(model_config)

# Fire two back-to-back requests and time them; the sliding-window
# limiter should force the second call to wait ~20 seconds.
started = time.time()
for _ in range(2):
    response = llm_completion.completion(
        messages="What is the capital of France?",
    )
elapsed = time.time() - started

assert elapsed >= 20, "Rate limiting did not work as expected."
print(f"Time taken for two requests: {elapsed:.2f} seconds")

# Dump the metrics collected by the client for inspection.
print(f"Metrics for: {llm_completion.metrics_store.id}")
print(json.dumps(llm_completion.metrics_store.get_metrics(), indent=2))
Notice that the compute_duration_seconds in the metrics only tracks how long a network request actually takes and does not include the paused periods that occur due to rate limits.