"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "4d367a3c8101bd72a711445d305005133ad15e73"
Unverified Commit c749bd40 authored by Yih-Dar's avatar Yih-Dar Committed by GitHub
Browse files

Pipeline testing - using tiny models on Hub (#20426)



* rework pipeline tests

* run pipeline tests

* fix

* fix

* fix

* revert the changes in get_test_pipeline() parameter list

* fix expected error message

* skip a test

* clean up

---------
Co-authored-by: default avatarydshieh <ydshieh@users.noreply.github.com>
parent a582cfce
...@@ -434,7 +434,7 @@ def create_circleci_config(folder=None): ...@@ -434,7 +434,7 @@ def create_circleci_config(folder=None):
example_file = os.path.join(folder, "examples_test_list.txt") example_file = os.path.join(folder, "examples_test_list.txt")
if os.path.exists(example_file) and os.path.getsize(example_file) > 0: if os.path.exists(example_file) and os.path.getsize(example_file) > 0:
jobs.extend(EXAMPLES_TESTS) jobs.extend(EXAMPLES_TESTS)
repo_util_file = os.path.join(folder, "test_repo_utils.txt") repo_util_file = os.path.join(folder, "test_repo_utils.txt")
if os.path.exists(repo_util_file) and os.path.getsize(repo_util_file) > 0: if os.path.exists(repo_util_file) and os.path.getsize(repo_util_file) > 0:
jobs.extend(REPO_UTIL_TESTS) jobs.extend(REPO_UTIL_TESTS)
......
...@@ -27,8 +27,8 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta ...@@ -27,8 +27,8 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta
class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): class AudioClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=feature_extractor) audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=processor)
# test with a raw waveform # test with a raw waveform
audio = np.zeros((34000,)) audio = np.zeros((34000,))
......
...@@ -60,7 +60,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel ...@@ -60,7 +60,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel
+ (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else [])
} }
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
if tokenizer is None: if tokenizer is None:
# Side effect of no Fast Tokenizer class for these model, so skipping # Side effect of no Fast Tokenizer class for these model, so skipping
# But the slow tokenizer test should still run as they're quite small # But the slow tokenizer test should still run as they're quite small
...@@ -69,7 +69,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel ...@@ -69,7 +69,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel
# return None, None # return None, None
speech_recognizer = AutomaticSpeechRecognitionPipeline( speech_recognizer = AutomaticSpeechRecognitionPipeline(
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor model=model, tokenizer=tokenizer, feature_extractor=processor
) )
# test with a raw waveform # test with a raw waveform
...@@ -133,7 +133,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel ...@@ -133,7 +133,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel
) )
else: else:
# Non CTC models cannot use return_timestamps # Non CTC models cannot use return_timestamps
with self.assertRaisesRegex(ValueError, "^We cannot return_timestamps yet on non-ctc models !$"): with self.assertRaisesRegex(
ValueError, "^We cannot return_timestamps yet on non-ctc models apart from Whisper !$"
):
outputs = speech_recognizer(audio, return_timestamps="char") outputs = speech_recognizer(audio, return_timestamps="char")
@require_torch @require_torch
......
...@@ -17,26 +17,20 @@ import importlib ...@@ -17,26 +17,20 @@ import importlib
import logging import logging
import os import os
import random import random
import string
import sys import sys
import tempfile import tempfile
import unittest import unittest
from abc import abstractmethod from abc import abstractmethod
from functools import lru_cache
from pathlib import Path from pathlib import Path
from unittest import skipIf from unittest import skipIf
import datasets import datasets
import numpy as np import numpy as np
import requests
from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_access_token from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_access_token
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import ( from transformers import (
FEATURE_EXTRACTOR_MAPPING,
IMAGE_PROCESSOR_MAPPING,
TOKENIZER_MAPPING,
AutoFeatureExtractor,
AutoImageProcessor,
AutoModelForSequenceClassification, AutoModelForSequenceClassification,
AutoTokenizer, AutoTokenizer,
DistilBertForSequenceClassification, DistilBertForSequenceClassification,
...@@ -71,123 +65,16 @@ from test_module.custom_pipeline import PairClassificationPipeline # noqa E402 ...@@ -71,123 +65,16 @@ from test_module.custom_pipeline import PairClassificationPipeline # noqa E402
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
ROBERTA_EMBEDDING_ADJUSMENT_CONFIGS = [ PATH_TO_TRANSFORMERS = os.path.join(Path(__file__).parent.parent.parent, "src/transformers")
"CamembertConfig",
"IBertConfig",
"LongformerConfig",
"MarkupLMConfig",
"RobertaConfig",
"RobertaPreLayerNormConfig",
"XLMRobertaConfig",
]
def get_checkpoint_from_architecture(architecture):
try:
module = importlib.import_module(architecture.__module__)
except ImportError:
logger.error(f"Ignoring architecture {architecture}")
return
if hasattr(module, "_CHECKPOINT_FOR_DOC"):
return module._CHECKPOINT_FOR_DOC
else:
logger.warning(f"Can't retrieve checkpoint from {architecture.__name__}")
def get_tiny_config_from_class(configuration_class):
if "OpenAIGPT" in configuration_class.__name__:
# This is the only file that is inconsistent with the naming scheme.
# Will rename this file if we decide this is the way to go
return
model_type = configuration_class.model_type
camel_case_model_name = configuration_class.__name__.split("Config")[0]
try:
model_slug = model_type.replace("-", "_")
module = importlib.import_module(f".test_modeling_{model_slug}", package=f"tests.models.{model_slug}")
model_tester_class = getattr(module, f"{camel_case_model_name}ModelTester", None)
except (ImportError, AttributeError):
logger.error(f"No model tester class for {configuration_class.__name__}")
return
if model_tester_class is None:
logger.warning(f"No model tester class for {configuration_class.__name__}")
return
model_tester = model_tester_class(parent=None)
if hasattr(model_tester, "get_pipeline_config"):
config = model_tester.get_pipeline_config()
elif hasattr(model_tester, "get_config"):
config = model_tester.get_config()
else:
config = None
logger.warning(f"Model tester {model_tester_class.__name__} has no `get_config()`.")
return config
@lru_cache(maxsize=100)
def get_tiny_tokenizer_from_checkpoint(checkpoint):
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
if tokenizer.vocab_size < 300:
# Wav2Vec2ForCTC for instance
# ByT5Tokenizer
# all are already small enough and have no Fast version that can
# be retrained
return tokenizer
logger.info("Training new from iterator ...")
vocabulary = string.ascii_letters + string.digits + " "
tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
logger.info("Trained.")
return tokenizer
def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_extractor_class):
try:
feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
except Exception:
try:
if feature_extractor_class is not None:
feature_extractor = feature_extractor_class()
else:
feature_extractor = None
except Exception:
feature_extractor = None
# Audio Spectogram Transformer specific.
if feature_extractor.__class__.__name__ == "ASTFeatureExtractor":
feature_extractor = feature_extractor.__class__(
max_length=tiny_config.max_length, num_mel_bins=tiny_config.num_mel_bins
)
# Speech2TextModel specific.
if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor:
feature_extractor = feature_extractor.__class__(
feature_size=tiny_config.input_feat_per_channel, num_mel_bins=tiny_config.input_feat_per_channel
)
# TODO remove this, once those have been moved to `image_processor`.
if hasattr(tiny_config, "image_size") and feature_extractor:
feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
return feature_extractor
def get_tiny_image_processor_from_checkpoint(checkpoint, tiny_config, image_processor_class): # Dynamically import the Transformers module to grab the attribute classes of the processor form their names.
try: spec = importlib.util.spec_from_file_location(
image_processor = AutoImageProcessor.from_pretrained(checkpoint) "transformers",
except Exception: os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"),
try: submodule_search_locations=[PATH_TO_TRANSFORMERS],
if image_processor_class is not None: )
image_processor = image_processor_class() transformers_module = spec.loader.load_module()
else:
image_processor = None
except Exception:
image_processor = None
if hasattr(tiny_config, "image_size") and image_processor:
image_processor = image_processor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
return image_processor
class ANY: class ANY:
...@@ -201,76 +88,171 @@ class ANY: ...@@ -201,76 +88,171 @@ class ANY:
return f"ANY({', '.join(_type.__name__ for _type in self._types)})" return f"ANY({', '.join(_type.__name__ for _type in self._types)})"
class PipelineTestCaseMeta(type): def is_test_to_skip(test_casse_name, config_class, model_architecture, tokenizer_name, processor_name):
def __new__(mcs, name, bases, dct): """Some tests are just not working"""
def gen_test(
ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extractor_class, image_processor_class to_skip = False
if config_class.__name__ == "RoCBertConfig" and test_casse_name in [
"FillMaskPipelineTests",
"FeatureExtractionPipelineTests",
"TextClassificationPipelineTests",
"TokenClassificationPipelineTests",
]:
# Get error: IndexError: index out of range in self.
# `word_shape_file` and `word_pronunciation_file` should be shrunk during tiny model creation,
# otherwise `IndexError` could occur in some embedding layers. Skip for now until this model has
# more usage.
to_skip = True
elif config_class.__name__ in ["LayoutLMv3Config", "LiltConfig"]:
# Get error: ValueError: Words must be of type `List[str]`. Previously, `LayoutLMv3` is not
# used in pipeline tests as it could not find a checkpoint
# TODO: check and fix if possible
to_skip = True
# config/model class we decide to skip
elif config_class.__name__ in ["TapasConfig"]:
# Get error: AssertionError: Table must be of type pd.DataFrame. Also, the tiny model has large
# vocab size as the fast tokenizer could not be converted. Previous, `Tapas` is not used in
# pipeline tests due to the same reason.
# TODO: check and fix if possible
to_skip = True
# TODO: check and fix if possible
if not to_skip and tokenizer_name is not None:
if (
test_casse_name == "QAPipelineTests"
and not tokenizer_name.endswith("Fast")
and config_class.__name__
in [
"FlaubertConfig",
"GPTJConfig",
"LongformerConfig",
"MvpConfig",
"OPTConfig",
"ReformerConfig",
"XLMConfig",
]
): ):
@skipIf( # `QAPipelineTests` fails for a few models when the slower tokenizer are used.
tiny_config is None, # (The slower tokenizers were never used for pipeline tests before the pipeline testing rework)
"TinyConfig does not exist, make sure that you defined a `_CONFIG_FOR_DOC` variable in the modeling" # TODO: check (and possibly fix) the `QAPipelineTests` with slower tokenizer
" file", to_skip = True
elif test_casse_name == "ZeroShotClassificationPipelineTests" and config_class.__name__ in [
"CTRLConfig",
"OpenAIGPTConfig",
]:
# Get `tokenizer does not have a padding token` error for both fast/slow tokenizers.
# `CTRLConfig` and `OpenAIGPTConfig` were never used in pipeline tests, either because of a missing
# checkpoint or because a tiny config could not be created
to_skip = True
elif test_casse_name == "TranslationPipelineTests" and config_class.__name__ in [
"M2M100Config",
"PLBartConfig",
]:
# Get `ValueError: Translation requires a `src_lang` and a `tgt_lang` for this model`.
# `M2M100Config` and `PLBartConfig` were never used in pipeline tests: cannot create a simple tokenizer
to_skip = True
elif test_casse_name == "TextGenerationPipelineTests" and config_class.__name__ in [
"ProphetNetConfig",
"TransfoXLConfig",
]:
# Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
# `TransfoXLConfig` and `ProphetNetConfig` were never used in pipeline tests: cannot create a simple
# tokenizer.
to_skip = True
elif test_casse_name == "FillMaskPipelineTests" and config_class.__name__ in [
"FlaubertConfig",
"XLMConfig",
]:
# Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
# `FlaubertConfig` and `TransfoXLConfig` were never used in pipeline tests: cannot create a simple
# tokenizer
to_skip = True
elif test_casse_name == "TextGenerationPipelineTests" and model_architecture.__name__ in [
"TFRoFormerForCausalLM"
]:
# TODO: add `prepare_inputs_for_generation` for `TFRoFormerForCausalLM`
to_skip = True
elif test_casse_name == "QAPipelineTests" and model_architecture.__name__ in ["FNetForQuestionAnswering"]:
# TODO: The change in `base.py` in the PR #21132 (https://github.com/huggingface/transformers/pull/21132)
# fails this test case. Skip for now - a fix for this along with the initial changes in PR #20426 is
# too much. Let `ydshieh` to fix it ASAP once #20426 is merged.
to_skip = True
return to_skip
def validate_test_components(test_case, model, tokenizer, processor):
# TODO: Move this to tiny model creation script
# head-specific (within a model type) necessary changes to the config
# 1. for `BlenderbotForCausalLM`
if model.__class__.__name__ == "BlenderbotForCausalLM":
model.config.encoder_no_repeat_ngram_size = 0
# TODO: Change the tiny model creation script: don't create models with problematic tokenizers
# Avoid `IndexError` in embedding layers
CONFIG_WITHOUT_VOCAB_SIZE = ["CanineConfig"]
if tokenizer is not None:
config_vocab_size = getattr(model.config, "vocab_size", None)
# For CLIP-like models
if config_vocab_size is None and hasattr(model.config, "text_config"):
config_vocab_size = getattr(model.config.text_config, "vocab_size", None)
if config_vocab_size is None and model.config.__class__.__name__ not in CONFIG_WITHOUT_VOCAB_SIZE:
raise ValueError(
"Could not determine `vocab_size` from model configuration while `tokenizer` is not `None`."
) )
# TODO: Remove tiny models from the Hub which have problematic tokenizers (but still keep this block)
if config_vocab_size is not None and len(tokenizer) > config_vocab_size:
test_case.skipTest(
f"Ignore {model.__class__.__name__}: `tokenizer` ({tokenizer.__class__.__name__}) has"
f" {len(tokenizer)} tokens which is greater than `config_vocab_size`"
f" ({config_vocab_size}). Something is wrong."
)
class PipelineTestCaseMeta(type):
def __new__(mcs, name, bases, dct):
def gen_test(repo_name, model_architecture, tokenizer_name, processor_name):
@skipIf( @skipIf(
checkpoint is None, tokenizer_name is None and processor_name is None,
"checkpoint does not exist, make sure that you defined a `_CHECKPOINT_FOR_DOC` variable in the" f"Ignore {model_architecture.__name__}: no processor class is provided (tokenizer, image processor,"
" modeling file", " feature extractor, etc)",
) )
def test(self): def test(self):
if ModelClass.__name__.endswith("ForCausalLM"): repo_id = f"hf-internal-testing/{repo_name}"
tiny_config.is_encoder_decoder = False
if hasattr(tiny_config, "encoder_no_repeat_ngram_size"): tokenizer = None
# specific for blenderbot which supports both decoder-only if tokenizer_name is not None:
# encoder/decoder but the test config only reflects tokenizer_class = getattr(transformers_module, tokenizer_name)
# encoder/decoder arch tokenizer = tokenizer_class.from_pretrained(repo_id)
tiny_config.encoder_no_repeat_ngram_size = 0
if ModelClass.__name__.endswith("WithLMHead"): processor = None
tiny_config.is_decoder = True if processor_name is not None:
processor_class = getattr(transformers_module, processor_name)
# If the required packages (like `Pillow`) are not installed, this will fail.
try:
processor = processor_class.from_pretrained(repo_id)
except Exception:
self.skipTest(f"Ignore {model_architecture.__name__}: could not load the model from {repo_id}")
try: try:
model = ModelClass(tiny_config) model = model_architecture.from_pretrained(repo_id)
except ImportError as e: except Exception:
self.skipTest( self.skipTest(f"Ignore {model_architecture.__name__}: could not load the model from {repo_id}")
f"Cannot run with {tiny_config} as the model requires a library that isn't installed: {e}"
) # validate
validate_test_components(self, model, tokenizer, processor)
if hasattr(model, "eval"): if hasattr(model, "eval"):
model = model.eval() model = model.eval()
if tokenizer_class is not None:
try: pipeline, examples = self.get_test_pipeline(model, tokenizer, processor)
tokenizer = get_tiny_tokenizer_from_checkpoint(checkpoint)
# XLNet actually defines it as -1.
if model.config.__class__.__name__ in ROBERTA_EMBEDDING_ADJUSMENT_CONFIGS:
tokenizer.model_max_length = model.config.max_position_embeddings - 2
elif (
hasattr(model.config, "max_position_embeddings")
and model.config.max_position_embeddings > 0
):
tokenizer.model_max_length = model.config.max_position_embeddings
# Rust Panic exception are NOT Exception subclass
# Some test tokenizer contain broken vocabs or custom PreTokenizer, so we
# provide some default tokenizer and hope for the best.
except: # noqa: E722
self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
else:
tokenizer = None
feature_extractor = get_tiny_feature_extractor_from_checkpoint(
checkpoint, tiny_config, feature_extractor_class
)
image_processor = get_tiny_image_processor_from_checkpoint(
checkpoint, tiny_config, image_processor_class
)
if tokenizer is None and feature_extractor is None and image_processor:
self.skipTest(
f"Ignoring {ModelClass}, cannot create a tokenizer or feature_extractor or image_processor"
" (PerceiverConfig with no FastTokenizer ?)"
)
pipeline, examples = self.get_test_pipeline(model, tokenizer, feature_extractor, image_processor)
if pipeline is None: if pipeline is None:
# The test can disable itself, but it should be very marginal # The test can disable itself, but it should be very marginal
# Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist) # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist)
return self.skipTest(f"Ignore {model_architecture.__name__}: could not create the pipeline")
self.run_pipeline_test(pipeline, examples) self.run_pipeline_test(pipeline, examples)
def run_batch_test(pipeline, examples): def run_batch_test(pipeline, examples):
...@@ -294,52 +276,45 @@ class PipelineTestCaseMeta(type): ...@@ -294,52 +276,45 @@ class PipelineTestCaseMeta(type):
return test return test
# Download tiny model summary (used to avoid requesting from Hub too many times)
url = "https://huggingface.co/datasets/hf-internal-testing/tiny-random-model-summary/raw/main/processor_classes.json"
tiny_model_summary = requests.get(url).json()
for prefix, key in [("pt", "model_mapping"), ("tf", "tf_model_mapping")]: for prefix, key in [("pt", "model_mapping"), ("tf", "tf_model_mapping")]:
mapping = dct.get(key, {}) mapping = dct.get(key, {})
if mapping: if mapping:
for configuration, model_architectures in mapping.items(): for config_class, model_architectures in mapping.items():
if not isinstance(model_architectures, tuple): if not isinstance(model_architectures, tuple):
model_architectures = (model_architectures,) model_architectures = (model_architectures,)
for model_architecture in model_architectures: for model_architecture in model_architectures:
checkpoint = get_checkpoint_from_architecture(model_architecture) model_arch_name = model_architecture.__name__
tiny_config = get_tiny_config_from_class(configuration) # Get the canonical name
tokenizer_classes = TOKENIZER_MAPPING.get(configuration, []) for _prefix in ["Flax", "TF"]:
feature_extractor_class = FEATURE_EXTRACTOR_MAPPING.get(configuration, None) if model_arch_name.startswith(_prefix):
feature_extractor_name = ( model_arch_name = model_arch_name[len(_prefix) :]
feature_extractor_class.__name__ if feature_extractor_class else "nofeature_extractor" break
)
image_processor_class = IMAGE_PROCESSOR_MAPPING.get(configuration, None) tokenizer_names = []
image_processor_name = ( processor_names = []
image_processor_class.__name__ if image_processor_class else "noimage_processor" if model_arch_name in tiny_model_summary:
) tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"]
if not tokenizer_classes: processor_names = tiny_model_summary[model_arch_name]["processor_classes"]
# We need to test even if there are no tokenizers. # Adding `None` (if empty) so we can generate tests
tokenizer_classes = [None] tokenizer_names = [None] if len(tokenizer_names) == 0 else tokenizer_names
else: processor_names = [None] if len(processor_names) == 0 else processor_names
# Remove the non defined tokenizers
# ByT5 and Perceiver are bytes-level and don't define repo_name = f"tiny-random-{model_arch_name}"
# FastTokenizer, we can just ignore those. for tokenizer_name in tokenizer_names:
tokenizer_classes = [ for processor_name in processor_names:
tokenizer_class for tokenizer_class in tokenizer_classes if tokenizer_class is not None if is_test_to_skip(
] name, config_class, model_architecture, tokenizer_name, processor_name
):
for tokenizer_class in tokenizer_classes: continue
if tokenizer_class is not None: test_name = f"test_{prefix}_{config_class.__name__}_{model_architecture.__name__}_{tokenizer_name}_{processor_name}"
tokenizer_name = tokenizer_class.__name__
else:
tokenizer_name = "notokenizer"
test_name = f"test_{prefix}_{configuration.__name__}_{model_architecture.__name__}_{tokenizer_name}_{feature_extractor_name}_{image_processor_name}"
if tokenizer_class is not None or feature_extractor_class is not None:
dct[test_name] = gen_test( dct[test_name] = gen_test(
model_architecture, repo_name, model_architecture, tokenizer_name, processor_name
checkpoint,
tiny_config,
tokenizer_class,
feature_extractor_class,
image_processor_class,
) )
@abstractmethod @abstractmethod
......
...@@ -53,7 +53,7 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM ...@@ -53,7 +53,7 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
else [] else []
) )
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer)
return conversation_agent, [Conversation("Hi there!")] return conversation_agent, [Conversation("Hi there!")]
......
...@@ -47,8 +47,8 @@ class DepthEstimationPipelineTests(unittest.TestCase, metaclass=PipelineTestCase ...@@ -47,8 +47,8 @@ class DepthEstimationPipelineTests(unittest.TestCase, metaclass=PipelineTestCase
model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
depth_estimator = DepthEstimationPipeline(model=model, feature_extractor=feature_extractor) depth_estimator = DepthEstimationPipeline(model=model, feature_extractor=processor)
return depth_estimator, [ return depth_estimator, [
"./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png",
"./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png",
......
...@@ -59,9 +59,9 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=Pipeli ...@@ -59,9 +59,9 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=Pipeli
@require_pytesseract @require_pytesseract
@require_vision @require_vision
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
dqa_pipeline = pipeline( dqa_pipeline = pipeline(
"document-question-answering", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor "document-question-answering", model=model, tokenizer=tokenizer, feature_extractor=processor
) )
image = INVOICE_URL image = INVOICE_URL
......
...@@ -175,7 +175,7 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa ...@@ -175,7 +175,7 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
raise ValueError("We expect lists of floats, nothing else") raise ValueError("We expect lists of floats, nothing else")
return shape return shape
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
if tokenizer is None: if tokenizer is None:
self.skipTest("No tokenizer") self.skipTest("No tokenizer")
return return
...@@ -196,9 +196,7 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa ...@@ -196,9 +196,7 @@ class FeatureExtractionPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
) )
return return
feature_extractor = FeatureExtractionPipeline( feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor)
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
)
return feature_extractor, ["This is a test", "This is another test"] return feature_extractor, ["This is a test", "This is another test"]
def run_pipeline_test(self, feature_extractor, examples): def run_pipeline_test(self, feature_extractor, examples):
......
...@@ -206,7 +206,7 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -206,7 +206,7 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
unmasker.tokenizer.pad_token = None unmasker.tokenizer.pad_token = None
self.run_pipeline_test(unmasker, []) self.run_pipeline_test(unmasker, [])
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
if tokenizer is None or tokenizer.mask_token_id is None: if tokenizer is None or tokenizer.mask_token_id is None:
self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)") self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)")
......
...@@ -49,8 +49,8 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest ...@@ -49,8 +49,8 @@ class ImageClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
image_classifier = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor, top_k=2) image_classifier = ImageClassificationPipeline(model=model, feature_extractor=processor, top_k=2)
examples = [ examples = [
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
"http://images.cocodataset.org/val2017/000000039769.jpg", "http://images.cocodataset.org/val2017/000000039769.jpg",
......
...@@ -81,10 +81,8 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa ...@@ -81,10 +81,8 @@ class ImageSegmentationPipelineTests(unittest.TestCase, metaclass=PipelineTestCa
+ (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else []) + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else [])
} }
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
image_segmenter = ImageSegmentationPipeline( image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor)
model=model, feature_extractor=feature_extractor, image_processor=image_processor
)
return image_segmenter, [ return image_segmenter, [
"./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png",
"./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png",
......
...@@ -36,8 +36,8 @@ class ImageToTextPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta ...@@ -36,8 +36,8 @@ class ImageToTextPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta
model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING
tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor) pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, feature_extractor=processor)
examples = [ examples = [
Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
"./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png",
......
...@@ -51,8 +51,8 @@ else: ...@@ -51,8 +51,8 @@ else:
class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): class ObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
object_detector = ObjectDetectionPipeline(model=model, feature_extractor=feature_extractor) object_detector = ObjectDetectionPipeline(model=model, feature_extractor=processor)
return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"] return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"]
def run_pipeline_test(self, object_detector, examples): def run_pipeline_test(self, object_detector, examples):
......
...@@ -31,7 +31,7 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ...@@ -31,7 +31,7 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING tf_model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
if isinstance(model.config, LxmertConfig): if isinstance(model.config, LxmertConfig):
# This is an bimodal model, we need to find a more consistent way # This is an bimodal model, we need to find a more consistent way
# to switch on those models. # to switch on those models.
......
...@@ -34,7 +34,7 @@ class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMe ...@@ -34,7 +34,7 @@ class SummarizationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMe
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer) summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)
return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"] return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"]
......
...@@ -34,7 +34,7 @@ class Text2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTest ...@@ -34,7 +34,7 @@ class Text2TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTest
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer) generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)
return generator, ["Something to write", "Something else"] return generator, ["Something to write", "Something else"]
......
...@@ -129,7 +129,7 @@ class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestC ...@@ -129,7 +129,7 @@ class TextClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestC
outputs = text_classifier("Birds are a type of animal") outputs = text_classifier("Birds are a type of animal")
self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}]) self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}])
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer) text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)
return text_classifier, ["HuggingFace is in", "This is another test"] return text_classifier, ["HuggingFace is in", "This is another test"]
......
...@@ -143,7 +143,7 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM ...@@ -143,7 +143,7 @@ class TextGenerationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM
], ],
) )
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
return text_generator, ["This is a test", "Another test"] return text_generator, ["This is a test", "Another test"]
......
...@@ -37,7 +37,7 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest ...@@ -37,7 +37,7 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer) token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
return token_classifier, ["A simple string", "A simple string that is quite a bit longer"] return token_classifier, ["A simple string", "A simple string that is quite a bit longer"]
......
...@@ -34,7 +34,7 @@ class TranslationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta ...@@ -34,7 +34,7 @@ class TranslationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta
model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor): def get_test_pipeline(self, model, tokenizer, processor):
if isinstance(model.config, MBartConfig): if isinstance(model.config, MBartConfig):
src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2] src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2]
translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang) translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment