Unverified Commit c749bd40 authored by Yih-Dar, committed by GitHub

Pipeline testing - using tiny models on Hub (#20426)



* rework pipeline tests

* run pipeline tests

* fix

* fix

* fix

* revert the changes in get_test_pipeline() parameter list

* fix expected error message

* skip a test

* clean up

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent a582cfce
@@ -35,11 +35,11 @@ from .test_pipelines_common import ANY, PipelineTestCaseMeta
 class VideoClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
+    def get_test_pipeline(self, model, tokenizer, processor):
         example_video_filepath = hf_hub_download(
             repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset"
         )
-        video_classifier = VideoClassificationPipeline(model=model, feature_extractor=feature_extractor, top_k=2)
+        video_classifier = VideoClassificationPipeline(model=model, feature_extractor=processor, top_k=2)
         examples = [
             example_video_filepath,
             "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4",
@@ -36,7 +36,7 @@ else:
 class VisualQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
+    def get_test_pipeline(self, model, tokenizer, processor):
         vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
         examples = [
             {
@@ -30,7 +30,7 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase, metaclass=PipelineT
     model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
     tf_model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
+    def get_test_pipeline(self, model, tokenizer, processor):
         classifier = ZeroShotClassificationPipeline(
             model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"]
         )
@@ -37,7 +37,7 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe
     # and only CLIP would be there for now.
     # model_mapping = {CLIPConfig: CLIPModel}
 
-    # def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
+    # def get_test_pipeline(self, model, tokenizer, processor):
     #     if tokenizer is None:
     #         # Side effect of no Fast Tokenizer class for these model, so skipping
     #         # But the slow tokenizer test should still run as they're quite small
@@ -46,7 +46,7 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase, metaclass=Pipe
     #         # return None, None
     #     image_classifier = ZeroShotImageClassificationPipeline(
-    #         model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
+    #         model=model, tokenizer=tokenizer, feature_extractor=processor
     #     )
     #     # test with a raw waveform
@@ -36,7 +36,7 @@ class ZeroShotObjectDetectionPipelineTests(unittest.TestCase, metaclass=Pipeline
     model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING
 
-    def get_test_pipeline(self, model, tokenizer, feature_extractor, image_processor):
+    def get_test_pipeline(self, model, tokenizer, processor):
         object_detector = pipeline(
             "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection"
         )
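All five pipeline test hunks above make the same change: `get_test_pipeline()` drops the separate `feature_extractor` / `image_processor` parameters in favor of a single generic `processor` argument. Below is a minimal sketch of how a harness can thread that one argument through; the return convention (`None` to skip, otherwise `(pipeline, examples)`) follows the tests shown above, but the harness itself is hypothetical, not the actual `PipelineTestCaseMeta` internals.

```python
# Hypothetical harness sketch (not the real PipelineTestCaseMeta code): a single
# `processor` stands in for whichever preprocessor the pipeline needs.
def run_pipeline_test(test_case, model, tokenizer, processor):
    result = test_case.get_test_pipeline(model, tokenizer, processor)
    if result is None:
        return  # the test skipped this model/processor combination
    pipe, examples = result
    for example in examples:
        assert pipe(example) is not None  # smoke test: pipeline runs end to end
```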
@@ -15,6 +15,7 @@
 import argparse
 import collections.abc
+import copy
 import importlib
 import inspect
 import json
@@ -31,6 +32,7 @@ from huggingface_hub import Repository, create_repo, upload_folder
 from transformers import (
     CONFIG_MAPPING,
     FEATURE_EXTRACTOR_MAPPING,
+    IMAGE_PROCESSOR_MAPPING,
     PROCESSOR_MAPPING,
     TOKENIZER_MAPPING,
     AutoTokenizer,
@@ -74,29 +76,36 @@ def get_processor_types_from_config_class(config_class, allowed_mappings=None):
     We use `tuple` here to include (potentially) both slow & fast tokenizers.
     """
 
+    # To make a uniform return type
+    def _to_tuple(x):
+        if not isinstance(x, collections.abc.Sequence):
+            x = (x,)
+        else:
+            x = tuple(x)
+        return x
+
     if allowed_mappings is None:
-        allowed_mappings = ["processor", "tokenizer", "feature_extractor"]
+        allowed_mappings = ["processor", "tokenizer", "image_processor", "feature_extractor"]
 
     processor_types = ()
 
-    # Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers or a feature extractor.
+    # Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers, and/or an image processor or
+    # a feature extractor
     if config_class in PROCESSOR_MAPPING and "processor" in allowed_mappings:
-        processor_types = PROCESSOR_MAPPING[config_class]
-    elif config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
-        processor_types = TOKENIZER_MAPPING[config_class]
-    elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
-        processor_types = FEATURE_EXTRACTOR_MAPPING[config_class]
-    else:
-        # Some configurations have no processor at all. For example, generic composite models like
-        # `EncoderDecoderModel` is used for any (compatible) text models. Also, `DecisionTransformer` doesn't
-        # require any processor.
-        pass
+        processor_types = _to_tuple(PROCESSOR_MAPPING[config_class])
+    else:
+        if config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
+            processor_types = TOKENIZER_MAPPING[config_class]
+
+        if config_class in IMAGE_PROCESSOR_MAPPING and "image_processor" in allowed_mappings:
+            processor_types += _to_tuple(IMAGE_PROCESSOR_MAPPING[config_class])
+        elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
+            processor_types += _to_tuple(FEATURE_EXTRACTOR_MAPPING[config_class])
 
-    # make a uniform return type
-    if not isinstance(processor_types, collections.abc.Sequence):
-        processor_types = (processor_types,)
-    else:
-        processor_types = tuple(processor_types)
+    # Remark: some configurations have no processor at all. For example, generic composite models like
+    # `EncoderDecoderModel` is used for any (compatible) text models. Also, `DecisionTransformer` doesn't
+    # require any processor.
 
     # We might get `None` for some tokenizers - remove them here.
     processor_types = tuple(p for p in processor_types if p is not None)
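The `_to_tuple` helper exists because the auto-mapping values are heterogeneous: `TOKENIZER_MAPPING[config_class]` is already a `(slow, fast)` tuple, while the processor, image-processor, and feature-extractor mappings may hold a single class. A standalone sketch of the normalization; the `Fake*` classes are illustrative only:

```python
import collections.abc

def _to_tuple(x):
    # A bare class is not a Sequence, so it gets wrapped; tuples/lists pass through.
    if not isinstance(x, collections.abc.Sequence):
        x = (x,)
    else:
        x = tuple(x)
    return x

class FakeTokenizer: ...
class FakeTokenizerFast: ...

print(_to_tuple(FakeTokenizer))                       # (<class 'FakeTokenizer'>,)
print(_to_tuple((FakeTokenizer, FakeTokenizerFast)))  # both classes, unchanged
```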
@@ -154,7 +163,7 @@ def get_config_class_from_processor_class(processor_class):
     return new_config_class
 
-def build_processor(config_class, processor_class):
+def build_processor(config_class, processor_class, allow_no_checkpoint=False):
     """Create a processor for `processor_class`.
 
     If a processor is not able to be built with the original arguments, this method tries to change the arguments and
@@ -264,6 +273,18 @@ def build_processor(config_class, processor_class):
         if config_class_from_processor_class != config_class:
             processor = build_processor(config_class_from_processor_class, processor_class)
 
+    # Try to create an image processor or a feature extractor without any checkpoint
+    if (
+        processor is None
+        and allow_no_checkpoint
+        and (issubclass(processor_class, BaseImageProcessor) or issubclass(processor_class, FeatureExtractionMixin))
+    ):
+        try:
+            processor = processor_class()
+        except Exception as e:
+            logger.error(e)
+            pass
+
     # validation
     if processor is not None:
         if not (isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")):
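Unlike tokenizers, which need vocabulary files from a checkpoint, image processors and feature extractors can usually be built from their constructor defaults alone; that is what this fallback (switched on via `allow_no_checkpoint=True` at the call site in the last hunk below) relies on. A hedged sketch; the classes are real `transformers` classes, but treat the exact defaults as assumptions:

```python
from transformers import ViTImageProcessor, Wav2Vec2FeatureExtractor

# Default-construct preprocessors with no Hub checkpoint at all.
image_processor = ViTImageProcessor()           # default image size / normalization
feature_extractor = Wav2Vec2FeatureExtractor()  # default sampling_rate (16000)
print(type(image_processor).__name__, feature_extractor.sampling_rate)
```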
@@ -458,6 +479,18 @@ def convert_processors(processors, tiny_config, output_folder, result):
             result["warnings"].append(f"Failed to convert feature extractors: {e}")
             feature_extractors = []
 
+    if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
+        if fast_tokenizer is not None:
+            if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
+                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
+            else:
+                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
+        if slow_tokenizer is not None:
+            if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
+                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
+            else:
+                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
+
     processors = [fast_tokenizer, slow_tokenizer] + feature_extractors
     processors = [p for p in processors if p is not None]
     for p in processors:
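The `- 2` for RoBERTa-style tokenizers reflects how RoBERTa allocates position embeddings: position ids start at `padding_idx + 1` (i.e. 2), so a config with `max_position_embeddings = N` can only attend over `N - 2` real tokens. A quick sanity check against a real checkpoint (needs Hub access):

```python
from transformers import AutoConfig, AutoTokenizer

config = AutoConfig.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# roberta-base: max_position_embeddings = 514, usable sequence length = 512
print(config.max_position_embeddings, tokenizer.model_max_length)
```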
@@ -491,6 +524,12 @@ def build_model(model_arch, tiny_config, output_dir):
     if os.path.isdir(processor_output_dir):
         shutil.copytree(processor_output_dir, checkpoint_dir, dirs_exist_ok=True)
 
+    tiny_config = copy.deepcopy(tiny_config)
+
+    if any([model_arch.__name__.endswith(x) for x in ["ForCausalLM", "LMHeadModel"]]):
+        tiny_config.is_encoder_decoder = False
+        tiny_config.is_decoder = True
+
     model = model_arch(config=tiny_config)
     model.save_pretrained(checkpoint_dir)
     model.from_pretrained(checkpoint_dir)
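`build()` instantiates several architectures (e.g. a `*ForCausalLM` head next to encoder-style heads) from the same `tiny_config` object, so the config is deep-copied before the decoder flags are flipped; otherwise `is_decoder=True` would leak into the other builds. A minimal sketch of the hazard:

```python
import copy
from transformers import BertConfig

shared = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2)

decoder_cfg = copy.deepcopy(shared)  # mutate a copy, not the shared object
decoder_cfg.is_decoder = True

print(shared.is_decoder, decoder_cfg.is_decoder)  # False True
```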
@@ -819,7 +858,7 @@ def build(config_class, models_to_create, output_dir):
         for processor_class in processor_classes:
             try:
-                processor = build_processor(config_class, processor_class)
+                processor = build_processor(config_class, processor_class, allow_no_checkpoint=True)
                 if processor is not None:
                     result["processor"][processor_class] = processor
             except Exception as e: