Unverified Commit 7c87f357 authored by Arthur, committed by GitHub

[`T5 and Llama Tokenizer`] remove warning (#29346)



* remove warning

* add co-author

* update

---------
Co-authored-by: hiaoxui <hiaoxui@users.noreply.github.com>
parent a5288852
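Context for the diff below: in all three tokenizers, the overridden tokenize() previously accepted add_special_tokens=False and re-injected it into super().tokenize() on every call. The base PreTrainedTokenizer.tokenize() does not consume that keyword, so the unconsumed kwarg appears to have triggered a "Keyword arguments {...} not recognized." warning even for plain calls. The following is a minimal, self-contained sketch of that mechanism; the classes here are stand-ins I wrote for illustration, not the actual transformers implementation.

# Minimal sketch (stand-in classes, not the real transformers code) of why the
# old code warned: the base tokenize() treats any keyword argument still left
# in **kwargs after prepare_for_tokenization() as unrecognized and logs it.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("tokenization_sketch")


class BasePreTrainedTokenizer:
    def prepare_for_tokenization(self, text, **kwargs):
        # The base class consumes no extra keyword arguments here.
        return text, kwargs

    def tokenize(self, text, **kwargs):
        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
        if kwargs:
            # Mirrors the "Keyword arguments ... not recognized." warning path.
            logger.warning("Keyword arguments %s not recognized.", kwargs)
        return text.split()


class OldStyleTokenizer(BasePreTrainedTokenizer):
    # Old behaviour: add_special_tokens is forwarded into **kwargs of the base
    # class unconditionally, so the warning fires on every call.
    def tokenize(self, text, add_special_tokens=False, **kwargs):
        return super().tokenize(text, add_special_tokens=add_special_tokens, **kwargs)


class NewStyleTokenizer(BasePreTrainedTokenizer):
    # New behaviour: nothing is injected, so there is no spurious warning.
    def tokenize(self, text, **kwargs):
        return super().tokenize(text, **kwargs)


OldStyleTokenizer().tokenize("Hello world")  # logs the warning
NewStyleTokenizer().tokenize("Hello world")  # silent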
@@ -243,7 +243,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         return vocab
 
     # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
-    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
+    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
         """
         Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
         first token is special.
@@ -255,7 +255,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         if self.add_prefix_space:
             text = SPIECE_UNDERLINE + text
 
-        tokens = super().tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
+        tokens = super().tokenize(text, **kwargs)
 
         if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
             tokens = tokens[1:]
@@ -447,7 +447,7 @@ class SeamlessM4TTokenizer(PreTrainedTokenizer):
         return tokenizer
 
     # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
-    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
+    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
         """
         Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
         first token is special.
@@ -459,7 +459,7 @@ class SeamlessM4TTokenizer(PreTrainedTokenizer):
         if self.add_prefix_space:
             text = SPIECE_UNDERLINE + text
 
-        tokens = super().tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
+        tokens = super().tokenize(text, **kwargs)
 
         if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
             tokens = tokens[1:]
@@ -377,7 +377,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(self.vocab_file)
 
-    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
+    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
         """
         Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
         first token is special.
@@ -389,7 +389,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         if self.add_prefix_space:
             text = SPIECE_UNDERLINE + text
 
-        tokens = super().tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
+        tokens = super().tokenize(text, **kwargs)
 
        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
             tokens = tokens[1:]
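A quick before/after check of the user-visible effect (assuming a locally cached t5-small checkpoint; the quoted warning text is my reconstruction of the base tokenizer's unrecognized-kwargs message, not taken from this commit):

from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")

# Before this commit, even this plain call logged a warning along the lines of
# "Keyword arguments {'add_special_tokens': False} not recognized." because the
# override forwarded the flag into **kwargs of the base class unconditionally.
# After this commit, the call runs silently.
print(tok.tokenize("Hello world"))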