Back to Graphrag

Caching

packages/graphrag-llm/notebooks/05_caching.ipynb

3.0.95.2 KB
Original Source

Caching

To enable caching, pass a Cache instance to the create_completion or create_embedding functions.

python
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

import json
import os

from dotenv import load_dotenv
from graphrag_cache import CacheConfig, CacheType, create_cache
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig
from graphrag_storage import StorageConfig, StorageType

load_dotenv()

# Default cache: JSON-serialized entries stored as files under "cache/".
cache = create_cache()
# The above default is equivalent to:
cache = create_cache(
    CacheConfig(
        type=CacheType.Json,
        storage=StorageConfig(type=StorageType.File, base_dir="cache"),
    )
)

# Azure OpenAI configuration. If no API key is set, fall back to
# Azure managed-identity authentication.
api_key = os.getenv("GRAPHRAG_API_KEY")
model_config = ModelConfig(
    model_provider="azure",
    model=os.getenv("GRAPHRAG_MODEL", "gpt-4o"),
    # NOTE(review): deployment name reuses GRAPHRAG_MODEL — assumes the Azure
    # deployment is named after the model; confirm for your environment.
    azure_deployment_name=os.getenv("GRAPHRAG_MODEL", "gpt-4o"),
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    auth_method=AuthMethod.AzureManagedIdentity if not api_key else AuthMethod.ApiKey,
)
# Passing cache= enables response caching for this completion client.
llm_completion: LLMCompletion = create_completion(model_config, cache=cache)

# Two identical requests: the first goes to the model, the second should be
# served from the cache.
response = llm_completion.completion(
    messages="What is the capital of France?",
)
response = llm_completion.completion(
    messages="What is the capital of France?",
)

print(f"Metrics for: {llm_completion.metrics_store.id}")
metrics = llm_completion.metrics_store.get_metrics()
print(json.dumps(metrics, indent=2))
# Exactly one of the two identical calls above was answered from the cache.
assert metrics["cached_responses"] == 1

Note on the above metrics

cached_responses == 1 since the request was cached by the time the second call was made.

The cached_responses indicates how many cache hits occurred but the rest of the metrics exist as if a cache was not used. For example, compute_duration_seconds and all the token counts and cost counts are as if cache was not used. This is because both the response and metrics are cached and retrieved from the cache when a cache hit occurs. Metrics were designed to give an idea of how long and costly a job would be if there were no cache.

Tests

This section exists because these notebooks double as integration tests. It ensures objects are loaded and deserialized from the cache properly and that cache hits bypass rate limiting.

Test Timing

python
import time

from graphrag_llm.config import RateLimitConfig, RateLimitType

# Throttle hard: at most 1 live request per minute. Cache hits must bypass
# the rate limiter, otherwise the 100-request loop below would take hours.
model_config.rate_limit = RateLimitConfig(
    type=RateLimitType.SlidingWindow,
    period_in_seconds=60,  # limit requests per minute
    requests_per_period=1,  # max 1 request per minute. Without cache this would take forever
)
llm_completion: LLMCompletion = create_completion(model_config, cache=cache)

# Use time.perf_counter() — a monotonic clock intended for measuring elapsed
# time. time.time() is wall-clock time and can jump if the system clock is
# adjusted, which would skew the measurement.
start_time = time.perf_counter()
for _ in range(100):
    response = llm_completion.completion(
        messages="What is the capital of France?",
    )
end_time = time.perf_counter()
total_time = end_time - start_time
print(f"Total time for 100 requests: {total_time} seconds")
assert total_time < 5.0  # Ensure that caching is effective

Test Structured Responses

python
from graphrag_llm.types import LLMCompletionResponse
from pydantic import BaseModel, Field


class LocalWeather(BaseModel):
    """City weather information model."""

    # NOTE(review): field descriptions (and the class docstring) presumably
    # flow into the structured-output JSON schema, which in turn affects the
    # cache key — keep them stable to preserve existing cache entries.
    city: str = Field(description="The name of the city")
    temperature: float = Field(description="The temperature in Celsius")
    condition: str = Field(description="The weather condition description")


class WeatherReports(BaseModel):
    """Weather information model."""

    # Container model: asks the LLM for one report per city mentioned in the
    # prompt, parsed as a list of LocalWeather entries.
    reports: list[LocalWeather] = Field(
        description="The weather reports for multiple cities"
    )


# Reset counters so the assertion below only reflects the two calls made here.
llm_completion.metrics_store.clear_metrics()
# First structured call: live request, response parsed into WeatherReports.
response: LLMCompletionResponse[WeatherReports] = llm_completion.completion(  # type: ignore
    messages="It is sunny and 52 degrees fahrenheit in Seattle. It is cloudy and 75 degrees fahrenheit in San Francisco.",
    response_format=WeatherReports,
)  # type: ignore
# Second call with identical messages AND response_format: expect a cache hit.
response: LLMCompletionResponse[WeatherReports] = llm_completion.completion(  # type: ignore
    messages="It is sunny and 52 degrees fahrenheit in Seattle. It is cloudy and 75 degrees fahrenheit in San Francisco.",
    response_format=WeatherReports,
)  # type: ignore

metrics = llm_completion.metrics_store.get_metrics()
assert metrics["cached_responses"] == 1, (
    f"Expected 1 cached response, got {metrics['cached_responses']}"
)


# Changing the response format should not hit the cache and
# instead be a new request and store a new response in the cache.


class WeatherReports2(BaseModel):
    """Weather information model."""

    # Identical shape to WeatherReports except the field is named
    # "local_reports" — a distinct response format, so requests using it
    # must not be served from cached WeatherReports responses.
    local_reports: list[LocalWeather] = Field(
        description="The weather reports for multiple cities"
    )


# Reset counters again so only the single call below is measured.
llm_completion.metrics_store.clear_metrics()
# Same request but different response format. Should not hit cache.
response: LLMCompletionResponse[WeatherReports2] = llm_completion.completion(
    messages="It is sunny and 52 degrees fahrenheit in Seattle. It is cloudy and 75 degrees fahrenheit in San Francisco.",
    response_format=WeatherReports2,
)  # type: ignore

metrics = llm_completion.metrics_store.get_metrics()
# .get(...) with a default of 0: when no cache hit occurred, the
# "cached_responses" key may be absent from the metrics dict entirely.
assert metrics.get("cached_responses", 0) == 0, (
    f"Expected 0 cached responses, got {metrics['cached_responses']}"
)