Unverified Commit 3d66146a authored by Lysandre Debut, committed by GitHub

Fixing tests for Perceiver (#14745)

- Do not run the image-classification pipeline test (`_CHECKPOINT_FOR_DOC` uses the
  language checkpoint, which cannot load a FeatureExtractor, so the current logic fails).
- Add a safeguard so tests do not run when `tokenizer_class` or
  `feature_extractor_class` **is** defined but cannot be loaded.
  This happens for Perceiver with the "FastTokenizer" (which does not exist,
  so it is `None`) and with the FeatureExtractor (which does exist but cannot be
  loaded because the checkpoint does not define one, which is reasonable for that
  checkpoint).
- Added a `get_vocab` method to `PerceiverTokenizer`, since the `fill-mask`
  pipeline uses it when the `targets` argument is passed to narrow the set of
  possible values.
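A minimal sketch of the call path behind that last point, assuming the public `deepmind/language-perceiver` checkpoint (the checkpoint name is not taken from this commit): passing `targets` makes the `fill-mask` pipeline call `tokenizer.get_vocab()` to map each target string to a vocabulary id, which `PerceiverTokenizer` did not implement before this change.

```python
# Sketch only: the checkpoint name below is an assumption, and prediction
# quality for a single masked byte is beside the point here.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="deepmind/language-perceiver")

text = f"This is a sentence with one {fill_mask.tokenizer.mask_token} byte."
# With `targets`, the pipeline looks each target up in tokenizer.get_vocab()
# and restricts the scores to those ids (single characters map to single
# byte-level tokens for this tokenizer).
outputs = fill_mask(text, targets=["s", "t"])
print([(o["token_str"], round(o["score"], 4)) for o in outputs])
```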
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
parent 4c99e553
@@ -43,6 +43,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
         ("detr", "DetrFeatureExtractor"),
         ("layoutlmv2", "LayoutLMv2FeatureExtractor"),
         ("clip", "CLIPFeatureExtractor"),
+        ("perceiver", "PerceiverFeatureExtractor"),
     ]
 )
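With this mapping entry in place, `AutoFeatureExtractor` resolves Perceiver image checkpoints to `PerceiverFeatureExtractor`. A small sketch, assuming the public `deepmind/vision-perceiver-learned` checkpoint (not referenced in this diff):

```python
# Sketch only: the checkpoint name is an assumption, not part of this commit.
from transformers import AutoFeatureExtractor, PerceiverFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("deepmind/vision-perceiver-learned")
assert isinstance(feature_extractor, PerceiverFeatureExtractor)
print(type(feature_extractor).__name__)  # PerceiverFeatureExtractor
```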
@@ -87,7 +87,7 @@ class PerceiverTokenizer(PreTrainedTokenizer):
         self._utf_vocab_size = 2 ** 8  # utf is 8 bits

         # define special tokens dict
-        self.special_tokens_encoder: Dict[int, str] = {
+        self.special_tokens_encoder: Dict[str, int] = {
             self.pad_token: 0,
             self.bos_token: 1,
             self.eos_token: 2,
@@ -96,7 +96,15 @@ class PerceiverTokenizer(PreTrainedTokenizer):
             self.sep_token: 5,
         }
         self._num_special_tokens = len(self.special_tokens_encoder)
-        self.special_tokens_decoder: Dict[str, int] = {v: k for k, v in self.special_tokens_encoder.items()}
+        self.special_tokens_decoder: Dict[int, str] = {v: k for k, v in self.special_tokens_encoder.items()}

+    def get_vocab(self) -> Dict[str, int]:
+        vocab = self.special_tokens_encoder.copy()
+        vocab.update(self.added_tokens_encoder)
+        for i in range(self._utf_vocab_size):
+            token = chr(i)
+            vocab[token] = i + len(self.special_tokens_encoder)
+        return vocab
+
     @property
     def vocab_size(self):
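A quick sanity check of the new `get_vocab` (again assuming the `deepmind/language-perceiver` checkpoint): the returned mapping covers the special tokens plus the 256 byte-level entries, and agrees with `convert_tokens_to_ids`.

```python
# Sketch only: the checkpoint name is an assumption, not part of this diff.
from transformers import PerceiverTokenizer

tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
vocab = tokenizer.get_vocab()

# Special tokens keep their reserved ids, and byte-level tokens are offset by
# the number of special tokens, so entries match convert_tokens_to_ids.
assert vocab[tokenizer.pad_token] == tokenizer.pad_token_id
assert vocab["a"] == tokenizer.convert_tokens_to_ids("a")
print(len(vocab))  # special tokens + 256 byte-level entries (+ any added tokens)
```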
@@ -169,6 +169,11 @@ class PipelineTestCaseMeta(type):
                 else:
                     tokenizer = None
                 feature_extractor = get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config)
+
+                if tokenizer is None and feature_extractor is None:
+                    self.skipTest(
+                        f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor (PerceiverConfig with no FastTokenizer ?)"
+                    )
                 pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor)
                 if pipeline is None:
                     # The test can disable itself, but it should be very marginal
@@ -213,6 +218,7 @@ class PipelineTestCaseMeta(type):
         if not tokenizer_classes:
             # We need to test even if there are no tokenizers.
             tokenizer_classes = [None]
+
         for tokenizer_class in tokenizer_classes:
             if tokenizer_class is not None:
                 tokenizer_name = tokenizer_class.__name__
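For context, the safeguard above relies on unittest's skip mechanism: `self.skipTest(...)` raises `unittest.SkipTest`, so the rest of the generated test never runs. A standalone sketch of the pattern (the class and test names here are illustrative, not the actual harness):

```python
import unittest


class GuardSketchTest(unittest.TestCase):
    """Illustrative only: mirrors the tokenizer/feature_extractor guard above."""

    def test_skips_when_nothing_loadable(self):
        tokenizer = None          # e.g. no PerceiverTokenizerFast exists
        feature_extractor = None  # e.g. the checkpoint defines no preprocessor config
        if tokenizer is None and feature_extractor is None:
            self.skipTest("cannot create a tokenizer or feature_extractor")
        self.fail("not reached: skipTest raises unittest.SkipTest")


if __name__ == "__main__":
    unittest.main()
```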
@@ -14,7 +14,12 @@

 import unittest

-from transformers import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, PreTrainedTokenizer, is_vision_available
+from transformers import (
+    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    PerceiverConfig,
+    PreTrainedTokenizer,
+    is_vision_available,
+)
 from transformers.pipelines import ImageClassificationPipeline, pipeline
 from transformers.testing_utils import (
     is_pipeline_test,
@@ -45,6 +50,10 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING

     def get_test_pipeline(self, model, tokenizer, feature_extractor):
+        if isinstance(model.config, PerceiverConfig):
+            self.skipTest(
+                "Perceiver model tester is defined with a language one, which has no feature_extractor, so the automated test cannot work here"
+            )
         image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
         examples = [
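The failure this skip avoids can be reproduced directly: loading a feature extractor from the language checkpoint (assumed here to be `deepmind/language-perceiver`) errors out because that repository defines no preprocessor configuration. A hedged sketch:

```python
# Sketch only: the checkpoint name is an assumption, not part of this diff.
from transformers import AutoFeatureExtractor

try:
    AutoFeatureExtractor.from_pretrained("deepmind/language-perceiver")
except Exception as exc:
    # Typically an error about a missing preprocessor_config.json for this repo.
    print(f"Expected failure: {exc}")
```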