python

from transformers import AutoTokenizer

python

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)

Encode and Decode

python

# treat surface forms of special tokens as actual special tokens
# the default, but unsafe (to be compatible with other projects)
# the same as tokenizer.encode("print('<|endoftext|>')<|endoftext|>", allowed_special='all', disallowed_special=())
tokenizer.encode("print('<|endoftext|>')<|endoftext|>")

python

tokenizer.decode([1350, 492, 151643, 863, 151643])

python

# treat texts just as texts, avoid injection attacks
tokenizer.encode("print('<|endoftext|>')", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]

python

tokenizer.decode([1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643])

python

# treat texts just as texts, avoid injection attacks, and raise error if surface forms of special tokens are ever encountered
tokenizer.encode("print('<|endoftext|>')", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]

python

# fine-grained control, just keep mind of this:
#    allowed_special is treated as special tokens
#    disallowed_special raise errors
#    allowed_special has higher priority than disallowed_special
tokenizer.encode("<|im_start|>print('<|extra_0|>')<|im_end|>", 
                 allowed_special={'<|im_start|>', '<|im_end|>'}, 
                 disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]

python

tokenizer.encode("<|im_start|>print('<|extra_0|>')<|im_end|>", 
                 allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, 
                 disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]

Special Token Management

python

# huggingface tokenizer has its own special token mechanism, so does tiktoken
# we only use the tiktoken mechanism for special tokens, which means many property of huggingface tokenizer will be None
tokenizer.unk_token

python

tokenizer.eos_token_id # use tokenizer.eod_id instead

python

tokenizer.pad_token_id

python

# use one of the extras such as <|extra_0|>
tokenizer.special_tokens['<|extra_0|>']

Utility Methods

python

# special tokens are str, tokens are bytes (since tiktoken operates on the bytes level)
ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]
tokenizer.convert_ids_to_tokens(ids)

python

tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))

python

ids = tokenizer.encode("<|im_start|>print('我是一只猫<|extra_0|>')\n#喵喵喵<|im_end|>", 
                 allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, 
                 disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]

python

tokenizer.convert_ids_to_tokens(ids)

python

tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))

python

tokenizer._convert_id_to_token(len(tokenizer)-1)

python

tokenizer._convert_token_to_id('<|extra_204|>')

Vocabulary Expansion

python

tokenizer("我是一只猫")

python

tokenizer.encode("是一只猫")

python

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, extra_vocab_file="qwen_extra.tiktoken")

python

len(tokenizer)

python

tokenizer("我是一只猫")

python

tokenizer.decode(tokenizer.encode("我是一只猫"))

python

tokenizer.encode("是一只猫")