packages/graphrag-llm/notebooks/07_rate_limiting.ipynb
Rate limiting is disabled by default. Requests can be limited by requests per period, by tokens per period, or by both.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import json
import os
import time
from dotenv import load_dotenv
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig, RateLimitConfig, RateLimitType
# Pull environment variables from a local .env file (if present).
load_dotenv()

api_key = os.getenv("GRAPHRAG_API_KEY")

# Prefer API-key auth when a key is provided; otherwise fall back to
# Azure managed identity.
auth = AuthMethod.ApiKey if api_key else AuthMethod.AzureManagedIdentity

# Sliding window: at most 3 requests per 60-second period, which paces
# calls out to roughly one every 20 seconds.
throttle = RateLimitConfig(
    type=RateLimitType.SlidingWindow,
    period_in_seconds=60,
    requests_per_period=3,
)

model_config = ModelConfig(
    model_provider="azure",
    model=os.getenv("GRAPHRAG_MODEL", "gpt-4o"),
    azure_deployment_name=os.getenv("GRAPHRAG_MODEL", "gpt-4o"),
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    auth_method=auth,
    rate_limit=throttle,
)
# Build the completion client from the configured model settings.
llm_completion: LLMCompletion = create_completion(model_config)

# Fire two back-to-back requests and time them; the sliding-window
# limiter should force the second call to wait ~20 seconds.
started = time.time()
for _ in range(2):
    response = llm_completion.completion(
        messages="What is the capital of France?",
    )
elapsed = time.time() - started

assert elapsed >= 20, "Rate limiting did not work as expected."
print(f"Time taken for two requests: {elapsed:.2f} seconds")

# Dump the metrics collected by the client for inspection.
print(f"Metrics for: {llm_completion.metrics_store.id}")
print(json.dumps(llm_completion.metrics_store.get_metrics(), indent=2))
Notice that the compute_duration_seconds in the metrics only tracks how long a network request actually takes and does not include the paused periods that occur due to rate limits.