Back to Graphrag

Encoding/Decoding

packages/graphrag-llm/notebooks/02_encoding_decoding.ipynb

3.0.9 · 2.4 KB
Original Source

Encoding/Decoding

LLMCompletion and LLMEmbedding expose a `tokenizer` property whose encoding corresponds to the underlying model.

python
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

import os

from dotenv import load_dotenv
from graphrag_llm.completion import LLMCompletion, create_completion
from graphrag_llm.config import AuthMethod, ModelConfig

load_dotenv()

# Read credentials once; the same model name doubles as the Azure deployment name.
api_key = os.getenv("GRAPHRAG_API_KEY")
model_name = os.getenv("GRAPHRAG_MODEL", "gpt-4o")

# Fall back to managed identity whenever no API key is configured.
model_config = ModelConfig(
    model_provider="azure",
    model=model_name,
    azure_deployment_name=model_name,
    api_base=os.getenv("GRAPHRAG_API_BASE"),
    api_version=os.getenv("GRAPHRAG_API_VERSION", "2025-04-01-preview"),
    api_key=api_key,
    auth_method=AuthMethod.ApiKey if api_key else AuthMethod.AzureManagedIdentity,
)
llm_completion: LLMCompletion = create_completion(model_config)

# Round-trip a sample string through the model's tokenizer.
token_ids = llm_completion.tokenizer.encode("Hello, world!")
print(f"Encoded tokens: {token_ids}")
print(f"Number of tokens: {len(token_ids)}")
# OR
print(f"Number of tokens: {llm_completion.tokenizer.num_tokens('Hello, world!')}")
round_trip = llm_completion.tokenizer.decode(token_ids)
print(f"Decoded text: {round_trip}")

Standalone Tokenizer — a tokenizer can also be created directly, without constructing an LLM client.

python
from graphrag_llm.config import TokenizerConfig, TokenizerType
from graphrag_llm.tokenizer import create_tokenizer

# Build a litellm-backed tokenizer directly from a config — no LLM client needed.
tokenizer = create_tokenizer(
    TokenizerConfig(
        type=TokenizerType.LiteLLM,
        model_id="openai/text-embedding-3-small",
    )
)

# Encode, count, and decode a sample string to show the round trip.
token_ids = tokenizer.encode("Hello, world!")
print(f"Encoded tokens: {token_ids}")
print(f"Number of tokens: {len(token_ids)}")
round_trip = tokenizer.decode(token_ids)
print(f"Decoded text: {round_trip}")

Tiktoken

By default, LLMCompletion and LLMEmbedding use a litellm-based tokenizer that supports the 100+ models litellm supports, but you may use a tiktoken-based tokenizer by specifying a tokenizer type of TokenizerType.Tiktoken and providing an encoding_name in the config.

python
# Swap in a tiktoken-backed tokenizer by naming the encoding explicitly.
tiktoken_tokenizer = create_tokenizer(
    TokenizerConfig(
        type=TokenizerType.Tiktoken,
        encoding_name="o200k_base",
    )
)
print(f"Encoded tokens: {tiktoken_tokenizer.encode('Hello, world!')}")

# Using with LLMCompletion
llm_completion: LLMCompletion = create_completion(model_config, tokenizer=tiktoken_tokenizer)

print(f"Encoded tokens: {llm_completion.tokenizer.encode('Hello, world!')}")