"git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "f902c5d4ca3930205e3ac6206eac6f9b258beb01"
Unverified commit 1240be3e, authored by Thomas Wolf and committed by GitHub
Browse files

Merge pull request #2312 from vitaliyradchenko/fix_special_and_add_tokens_loading

Correct tokenization for special and added tokens
parents cea04a24 b262577d
......@@ -469,6 +469,9 @@ class PreTrainedTokenizer(object):
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# update unique_added_tokens_encoder with special tokens for correct tokenization
tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens))
# Add supplementary tokens.
if added_tokens_file is not None:
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
......@@ -476,6 +479,7 @@ class PreTrainedTokenizer(object):
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
tokenizer.added_tokens_encoder.update(added_tok_encoder)
tokenizer.added_tokens_decoder.update(added_tok_decoder)
tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys()))
return tokenizer
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment