"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "387217bd3e9a564cd84d4c4cc3c2f25ce30966bc"
Unverified commit 5164ea91, authored by Lysandre Debut, committed by GitHub

Skipping outputs (#3116)

* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
parent 49debe62
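The diffs below add a model_input_names class attribute to the DistilBERT and RoBERTa tokenizers. Per the commit message ("return_outputs -> model_input_names"), this attribute lists which optional encodings the tokenizer returns by default alongside input_ids; DistilBERT and RoBERTa do not use segment embeddings, so they declare only "attention_mask" and stop emitting token_type_ids. A minimal sketch of the intended effect, assuming a transformers version that includes this change and the named checkpoints are available:

from transformers import BertTokenizer, DistilBertTokenizer

# BERT keeps the default model_input_names, so token_type_ids is still returned.
bert = BertTokenizer.from_pretrained("bert-base-uncased")
print(sorted(bert.encode_plus("Hello world").keys()))
# expected (per this change): ['attention_mask', 'input_ids', 'token_type_ids']

# DistilBERT declares model_input_names = ["attention_mask"], so token_type_ids
# should no longer appear in the default output.
distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
print(sorted(distilbert.encode_plus("Hello world").keys()))
# expected (per this change): ['attention_mask', 'input_ids']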
@@ -69,6 +69,7 @@ class DistilBertTokenizer(BertTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    model_input_names = ["attention_mask"]


 class DistilBertTokenizerFast(BertTokenizerFast):
@@ -76,3 +77,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    model_input_names = ["attention_mask"]
@@ -119,6 +119,7 @@ class RobertaTokenizer(GPT2Tokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

     def __init__(
         self,
@@ -244,6 +245,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

     def __init__(
         self,
...
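RobertaTokenizer and RobertaTokenizerFast get the same treatment as DistilBERT above. Callers that still need the skipped output can request it per call; this sketch assumes the explicit return_token_type_ids argument overrides the class-level model_input_names default, which is how the rest of this PR reads, but it is not confirmed by the captured diff:

from transformers import RobertaTokenizer

roberta = RobertaTokenizer.from_pretrained("roberta-base")

# Default: token_type_ids is skipped because model_input_names = ["attention_mask"].
default_encoding = roberta.encode_plus("Hello world")
print("token_type_ids" in default_encoding)  # expected: False

# Opt back in explicitly for code paths that still expect the key.
forced_encoding = roberta.encode_plus("Hello world", return_token_type_ids=True)
print("token_type_ids" in forced_encoding)   # expected: True (all zeros for RoBERTa)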
(One file's diff is collapsed in this view and not shown.)
@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
         # to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
         return [
             {value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
-            for i in range(len(batch_encode_plus_sequences))
+            for i in range(len(batch_encode_plus_sequences["input_ids"]))
         ]

     def test_tokenizers_common_properties(self):
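The loop-bound fix above matters because batch_encode_plus returns a dict keyed by feature name, so len() of that dict counts features, not sequences in the batch; indexing "input_ids" gives the true batch length. A standalone illustration with a plain dict (the token id values are made up):

# batch_encode_plus-style output for a batch of 3 sequences with 2 feature keys.
batch_encode_plus_sequences = {
    "input_ids": [[101, 7592, 102], [101, 2088, 102], [101, 1012, 102]],
    "attention_mask": [[1, 1, 1], [1, 1, 1], [1, 1, 1]],
}

print(len(batch_encode_plus_sequences))               # 2 -> number of keys (the old, incorrect bound)
print(len(batch_encode_plus_sequences["input_ids"]))  # 3 -> number of sequences (the corrected bound)

# The test's comprehension then yields one per-example dict per sequence:
per_example = [
    {key: batch_encode_plus_sequences[key][i] for key in batch_encode_plus_sequences}
    for i in range(len(batch_encode_plus_sequences["input_ids"]))
]
print(len(per_example))  # 3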
@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
     def test_mask_output(self):
         tokenizer = self.get_tokenizer()

-        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
+        if (
+            tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
+            and "token_type_ids" in tokenizer.model_input_names
+        ):
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
             information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
         encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
         input_ids = encoded_sequence["input_ids"]
-        token_type_ids = encoded_sequence["token_type_ids"]
-        attention_mask = encoded_sequence["attention_mask"]
         special_tokens_mask = encoded_sequence["special_tokens_mask"]
         sequence_length = len(input_ids)

         # Test right padding
         tokenizer.padding_side = "right"
-        padded_sequence = tokenizer.encode_plus(
+        right_padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
             pad_to_max_length=True,
             return_special_tokens_mask=True,
         )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
-
-        assert sequence_length + padding_size == padded_sequence_length
-        assert input_ids + [padding_idx] * padding_size == padded_input_ids
-        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
-        assert attention_mask + [0] * padding_size == padded_attention_mask
-        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
+        right_padded_input_ids = right_padded_sequence["input_ids"]
+        right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
+        right_padded_sequence_length = len(right_padded_input_ids)
+
+        assert sequence_length + padding_size == right_padded_sequence_length
+        assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
+        assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask

         # Test left padding
         tokenizer.padding_side = "left"
-        padded_sequence = tokenizer.encode_plus(
+        left_padded_sequence = tokenizer.encode_plus(
             sequence,
             max_length=sequence_length + padding_size,
             pad_to_max_length=True,
             return_special_tokens_mask=True,
         )
-        padded_input_ids = padded_sequence["input_ids"]
-        padded_token_type_ids = padded_sequence["token_type_ids"]
-        padded_attention_mask = padded_sequence["attention_mask"]
-        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
-        padded_sequence_length = len(padded_input_ids)
-
-        assert sequence_length + padding_size == padded_sequence_length
-        assert [padding_idx] * padding_size + input_ids == padded_input_ids
-        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
-        assert [0] * padding_size + attention_mask == padded_attention_mask
-        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
+        left_padded_input_ids = left_padded_sequence["input_ids"]
+        left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
+        left_padded_sequence_length = len(left_padded_input_ids)
+
+        assert sequence_length + padding_size == left_padded_sequence_length
+        assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
+        assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
+
+        if "token_type_ids" in tokenizer.model_input_names:
+            token_type_ids = encoded_sequence["token_type_ids"]
+            left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
+            right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
+
+            assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
+            assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
+
+        if "attention_mask" in tokenizer.model_input_names:
+            attention_mask = encoded_sequence["attention_mask"]
+            right_padded_attention_mask = right_padded_sequence["attention_mask"]
+            left_padded_attention_mask = left_padded_sequence["attention_mask"]
+
+            assert attention_mask + [0] * padding_size == right_padded_attention_mask
+            assert [0] * padding_size + attention_mask == left_padded_attention_mask

     def test_separate_tokenizers(self):
         # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
...
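The padding test now reads token_type_ids and attention_mask only when the tokenizer lists them in model_input_names. Code that has to work across tokenizer classes can use the same guard; here is a short sketch of that pattern (the describe_encoding helper is invented for illustration and is not part of the library or this PR):

def describe_encoding(tokenizer, text):
    """Report only the outputs this tokenizer declares in model_input_names."""
    encoding = tokenizer.encode_plus(text)

    report = {"num_tokens": len(encoding["input_ids"])}

    # Mirror the guards used in the updated tests above.
    if "token_type_ids" in tokenizer.model_input_names:
        report["num_segments"] = len(set(encoding["token_type_ids"]))
    if "attention_mask" in tokenizer.model_input_names:
        report["num_attended_tokens"] = sum(encoding["attention_mask"])

    return report

# e.g. describe_encoding(RobertaTokenizer.from_pretrained("roberta-base"), "Hello world")
# should yield num_tokens and num_attended_tokens but no num_segments.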