Unverified Commit 538b3b46 authored by Patrick von Platen, committed by GitHub

[Tokenizer Utils Base] Make pad function more flexible (#9928)

* change tokenizer requirement

* split line

* Correct typo from list to str

* improve style

* make other function pretty as well

* add comment

* correct typo

* add new test

* pass tests for tok without padding token

* Apply suggestions from code review
parent d1b14c9b
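Every hunk below follows the same two-part pattern: "input_ids" is now listed explicitly in each tokenizer's model_input_names, and downstream code uses tokenizer.model_input_names directly instead of prepending the key by hand. A minimal sketch of the resulting contract, assuming the transformers library and the public "gpt2" checkpoint (only the model_input_names attribute itself is touched by this commit; the rest of the snippet is illustrative):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# "input_ids" now appears explicitly in the attribute, so callers can rely on
# it as-is instead of writing ["input_ids"] + tokenizer.model_input_names.
input_names = tokenizer.model_input_names
print(input_names)  # ['input_ids', 'attention_mask']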
@@ -64,7 +64,7 @@ def get_tfds(
     label_name = features_name.pop(label_column_id)
     label_list = list(set(ds[list(files.keys())[0]][label_name]))
     label2id = {label: i for i, label in enumerate(label_list)}
-    input_names = ["input_ids"] + tokenizer.model_input_names
+    input_names = tokenizer.model_input_names
     transformed_ds = {}

     if len(features_name) == 1:

@@ -98,7 +98,7 @@ if is_tf_available():
                 label = d.pop("label")
                 yield (d, label)

-        input_names = ["input_ids"] + tokenizer.model_input_names
+        input_names = tokenizer.model_input_names

         return tf.data.Dataset.from_generator(
             gen,

@@ -97,7 +97,7 @@ class BarthezTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -106,7 +106,7 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = BarthezTokenizer

     def __init__(

@@ -92,7 +92,7 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
         },
     }
     max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -100,7 +100,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -110,7 +110,7 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = CamembertTokenizer

     def __init__(

@@ -68,4 +68,4 @@ class DistilBertTokenizer(BertTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

@@ -77,5 +77,5 @@ class DistilBertTokenizerFast(BertTokenizerFast):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = DistilBertTokenizer

@@ -385,4 +385,4 @@ class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
     pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

@@ -387,5 +387,5 @@ class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
     pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = DPRReaderTokenizer

@@ -177,7 +177,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -148,7 +148,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -116,7 +116,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = GPT2Tokenizer

     def __init__(

@@ -92,7 +92,7 @@ class MarianTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     language_code_re = re.compile(">>.+<<")  # type: re.Pattern

     def __init__(

@@ -122,7 +122,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -102,7 +102,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     slow_tokenizer_class = MPNetTokenizer
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,

@@ -94,7 +94,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super().__init__(unk_token=unk_token, **kwargs)

@@ -61,7 +61,7 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = OpenAIGPTTokenizer

     def __init__(self, vocab_file, merges_file, tokenizer_file=None, unk_token="<unk>", **kwargs):

@@ -84,7 +84,7 @@ class PegasusTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["attention_mask"]
+    model_input_names = ["input_ids", "attention_mask"]

     def __init__(
         self,
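For completeness, a hedged sketch of the pad() entry point named in the commit title; the flexibility changes themselves presumably live in tokenization_utils_base.py (per the title) rather than in the hunks above, and the checkpoint name, sample strings, and pad-token assignment below are assumptions for illustration only:

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

# Encode two sequences of different lengths, then pad them as one batch.
features = [tokenizer("hello world"), tokenizer("hi")]
batch = tokenizer.pad(features, padding="longest")

# Both entries come back padded to the same length under the keys listed in
# tokenizer.model_input_names ("input_ids" and "attention_mask").
print(batch["input_ids"])
print(batch["attention_mask"])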