"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "db98a4a48b79bd11e24acd4b9ea2fdf17075f9e5"
Unverified Commit 085ea7e5 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

[`CodeLlamaTokenizer`] Nit, update __init__ to make sure the AddedTokens are...

[`CodeLlamaTokenizer`] Nit, update __init__ to make sure the AddedTokens are not normalized because they are special (#27359)

* make sure tokens are properly initialized for codellama slow

* add more pretrained models

* style

* test more tokenizers checkpoints
parent 7ecd229b
...@@ -149,9 +149,9 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): ...@@ -149,9 +149,9 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
): ):
requires_backends(self, "protobuf") requires_backends(self, "protobuf")
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
self.use_default_system_prompt = use_default_system_prompt self.use_default_system_prompt = use_default_system_prompt
# mark tokens special to skip them # mark tokens special to skip them
......
...@@ -150,6 +150,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -150,6 +150,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list = [ self.tokenizers_list = [
(self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}), (self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
(self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}), (self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
(self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
(self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
] ]
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment