examples/tokenizer_showcase.ipynb
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)
# treat surface forms of special tokens as actual special tokens
# the default, but unsafe (to be compatible with other projects)
# the same as tokenizer.encode("print('<|endoftext|>')<|endoftext|>", allowed_special='all', disallowed_special=())
tokenizer.encode("print('<|endoftext|>')<|endoftext|>")
tokenizer.decode([1350, 492, 151643, 863, 151643])
# treat texts just as texts, avoid injection attacks
tokenizer.encode("print('<|endoftext|>')", allowed_special=set(), disallowed_special=()) + [tokenizer.eod_id]
tokenizer.decode([1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643])
# treat texts just as texts, avoid injection attacks, and raise error if surface forms of special tokens are ever encountered
tokenizer.encode("print('<|endoftext|>')", allowed_special=set(), disallowed_special='all') + [tokenizer.eod_id]
# fine-grained control, just keep mind of this:
# allowed_special is treated as special tokens
# disallowed_special raise errors
# allowed_special has higher priority than disallowed_special
tokenizer.encode("<|im_start|>print('<|extra_0|>')<|im_end|>",
allowed_special={'<|im_start|>', '<|im_end|>'},
disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]
tokenizer.encode("<|im_start|>print('<|extra_0|>')<|im_end|>",
allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'},
disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]
# huggingface tokenizer has its own special token mechanism, so does tiktoken
# we only use the tiktoken mechanism for special tokens, which means many property of huggingface tokenizer will be None
tokenizer.unk_token
tokenizer.eos_token_id # use tokenizer.eod_id instead
tokenizer.pad_token_id
# use one of the extras such as <|extra_0|>
tokenizer.special_tokens['<|extra_0|>']
# special tokens are str, tokens are bytes (since tiktoken operates on the bytes level)
ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]
tokenizer.convert_ids_to_tokens(ids)
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))
ids = tokenizer.encode("<|im_start|>print('我是一只猫<|extra_0|>')\n#喵喵喵<|im_end|>",
allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'},
disallowed_special=['<|endoftext|>']) + [tokenizer.eod_id]
tokenizer.convert_ids_to_tokens(ids)
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids))
tokenizer._convert_id_to_token(len(tokenizer)-1)
tokenizer._convert_token_to_id('<|extra_204|>')
tokenizer("我是一只猫")
tokenizer.encode("是一只猫")
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True, extra_vocab_file="qwen_extra.tiktoken")
len(tokenizer)
tokenizer("我是一只猫")
tokenizer.decode(tokenizer.encode("我是一只猫"))
tokenizer.encode("是一只猫")