Back to Qwen

Encode and Decode

examples/tokenizer_showcase.ipynb

latest · 3.4 KB
Original Source
python
from transformers import AutoTokenizer
python
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)

Encode and Decode

python
# Treat surface forms of special tokens as actual special tokens.
# This is the default, but it is unsafe (kept for compatibility with other
# projects): text that merely *looks* like a special token is encoded as one.
# Equivalent to: tokenizer.encode("print('<|endoftext|>')<|endoftext|>", allowed_special='all', disallowed_special=())
tokenizer.encode("print('<|endoftext|>')<|endoftext|>")
python
tokenizer.decode([1350, 492, 151643, 863, 151643])
python
# Treat texts just as texts to avoid injection attacks: no special tokens are
# recognized inside the input string, and the real end-of-document special
# token is appended explicitly via tokenizer.eod_id.
tokenizer.encode("print('<|endoftext|>')", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]
python
tokenizer.decode([1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643])
python
# Treat texts just as texts, avoid injection attacks, AND raise an error if a
# surface form of any special token is ever encountered in the input.
tokenizer.encode("print('<|endoftext|>')", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]

python
# Fine-grained control — just keep in mind:
#    tokens in allowed_special are treated as special tokens
#    tokens in disallowed_special raise errors when found in the text
#    allowed_special has higher priority than disallowed_special
# Here <|extra_0|> is neither allowed nor disallowed, so it is encoded as plain text.
tokenizer.encode("<|im_start|>print('<|extra_0|>')<|im_end|>", 
                 allowed_special={'<|im_start|>', '<|im_end|>'}, 
                 disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]
python
# Same input, but <|extra_0|> is now also in allowed_special, so it is encoded
# as a special token instead of plain text.
tokenizer.encode("<|im_start|>print('<|extra_0|>')<|im_end|>", 
                 allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, 
                 disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]

Special Token Management

python
# The huggingface tokenizer has its own special-token mechanism, and so does
# tiktoken. Qwen only uses the tiktoken mechanism for special tokens, which
# means many special-token properties of the huggingface tokenizer will be None.
tokenizer.unk_token
python
tokenizer.eos_token_id # use tokenizer.eod_id instead
python
tokenizer.pad_token_id 
python
# Use one of the extras such as <|extra_0|> (e.g. as a pad token); its id can
# be looked up in the tokenizer.special_tokens mapping.
tokenizer.special_tokens['<|extra_0|>']

Utility Methods

python
# Special tokens are str, ordinary tokens are bytes (since tiktoken operates
# at the byte level, sub-character pieces cannot always be decoded as str).
ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]
tokenizer.convert_ids_to_tokens(ids)
python
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))
python
# Encode text mixing Chinese characters with allowed special tokens; append
# the end-of-document id explicitly.
ids = tokenizer.encode("<|im_start|>print('我是一只猫<|extra_0|>')\n#喵喵喵<|im_end|>", 
                 allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'}, 
                 disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]
python
tokenizer.convert_ids_to_tokens(ids)
python
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))
python
tokenizer._convert_id_to_token(len(tokenizer)-1)
python
tokenizer._convert_token_to_id('<|extra_204|>')

Vocabulary Expansion

python
tokenizer("我是一只猫")
python
tokenizer.encode("是一只猫")
python
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, extra_vocab_file="qwen_extra.tiktoken")

python
len(tokenizer)
python
tokenizer("我是一只猫")
python
tokenizer.decode(tokenizer.encode("我是一只猫"))
python
tokenizer.encode("是一只猫")