packages/graphrag-llm/notebooks/02_encoding_decoding.ipynb
LLMCompletion and LLMEmbedding each expose a `tokenizer` property that corresponds to the underlying model. Use it to encode text into tokens, count tokens, and decode tokens back into text.
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
import os
from dotenv import load_dotenv
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig
# Load environment settings (via python-dotenv) before reading any of them.
load_dotenv()
api_key = os.getenv("GRAPHRAG_API_KEY")

# GRAPHRAG_MODEL supplies both the model name and the Azure deployment name.
model_name = os.getenv("GRAPHRAG_MODEL", "gpt-4o")

model_config = ModelConfig(
    model_provider="azure",
    model=model_name,
    azure_deployment_name=model_name,
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    # Use key-based auth when a key is configured; otherwise use managed identity.
    auth_method=AuthMethod.ApiKey if api_key else AuthMethod.AzureManagedIdentity,
)
llm_completion: LLMCompletion = create_completion(model_config)

# Encode a sample string with the tokenizer exposed by the completion object.
sample_text = "Hello, world!"
completion_tokenizer = llm_completion.tokenizer

encoded = completion_tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")
print(f"Number of tokens: {len(encoded)}")

# Alternatively, ask the tokenizer for the token count directly.
token_count = completion_tokenizer.num_tokens(sample_text)
print(f"Number of tokens: {token_count}")

# Turn the encoded tokens back into text.
decoded = completion_tokenizer.decode(encoded)
print(f"Decoded text: {decoded}")
from graphrag_llm.config import TokenizerConfig, TokenizerType
from graphrag_llm.tokenizer import create_tokenizer
# Build a tokenizer directly from a config, without creating a completion object.
litellm_config = TokenizerConfig(
    type=TokenizerType.LiteLLM,
    model_id="openai/text-embedding-3-small",
)
tokenizer = create_tokenizer(litellm_config)

# Encode the sample string, report the tokens and their count, then decode it back.
sample_text = "Hello, world!"
encoded = tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")
print(f"Number of tokens: {len(encoded)}")

decoded = tokenizer.decode(encoded)
print(f"Decoded text: {decoded}")
By default, LLMCompletion and LLMEmbedding use a litellm-based tokenizer, which supports the 100+ models that litellm supports. You may instead use a tiktoken-based tokenizer by specifying a tokenizer type of `TokenizerType.Tiktoken` and providing an `encoding_name` in the config.
# Build a tiktoken-type tokenizer by naming the encoding explicitly.
tiktoken_config = TokenizerConfig(
    type=TokenizerType.Tiktoken,
    encoding_name="o200k_base",
)
tokenizer = create_tokenizer(tiktoken_config)

sample_text = "Hello, world!"
encoded = tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")

# Pass the custom tokenizer to create_completion, then encode the same text
# through the tokenizer exposed by the resulting completion object.
llm_completion: LLMCompletion = create_completion(model_config, tokenizer=tokenizer)
encoded = llm_completion.tokenizer.encode(sample_text)
print(f"Encoded tokens: {encoded}")