Unverified Commit e8cc0255 authored by Yih-Dar, committed by GitHub

Automatically create/update tiny models (#22275)



* Automatically create or update tiny models

* Skip failed tests

* update workflow file

* use revision

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent a92e0ad2
name: Self-hosted runner (push)
on:
push:
branches:
- update_tiny_models*
repository_dispatch:
schedule:
- cron: "0 2 * * *"
env:
TOKEN: ${{ secrets.SYLVAIN_HF_TOKEN }}
jobs:
update_tiny_models:
name: Update tiny models
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: Install
run: |
python -m pip install -U .[dev]
python -m pip install -U natten
- name: Update tiny models
run: |
python utils/update_tiny_models.py
- name: Full report
run: cat tiny_models/reports/tiny_model_creation_report.json
- name: Failure report
run: cat tiny_models/reports/simple_failed_report.txt
- name: Summary report
run: cat tiny_models/reports/tiny_model_summary.json
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v3
with:
name: tiny_model_creation_reports
path: tiny_models/reports
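For a quick local look at the same reports that the workflow prints and uploads as the `tiny_model_creation_reports` artifact, a minimal sketch; the report paths come from the steps above, while the helper itself is hypothetical:

import json

def print_report_overview(reports_dir="tiny_models/reports"):
    # Summary of the tiny models that were created or updated.
    with open(f"{reports_dir}/tiny_model_summary.json") as fp:
        summary = json.load(fp)
    print(f"{len(summary)} architectures in the tiny model summary")
    # Short human-readable failure list, also shown by the "Failure report" step.
    with open(f"{reports_dir}/simple_failed_report.txt") as fp:
        failures = fp.read().strip()
    if failures:
        print("Failures:\n" + failures)

if __name__ == "__main__":
    print_report_overview()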
......@@ -402,6 +402,15 @@ class AltCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
test_resize_embeddings = False
test_attention_outputs = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "FeatureExtractionPipelineTests":
return True
return False
def setUp(self):
self.model_tester = AltCLIPModelTester(self)
......
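The `is_pipeline_test_to_skip` overrides added here and in the test files below all follow the same pattern: the model-specific test class opts out of a pipeline test that currently fails for it. A minimal sketch of how such a hook is typically consulted by the shared pipeline tester; the helper name and call site are illustrative, not taken from this diff:

def maybe_skip_reason(test_case, pipeline_test_casse_name, config_class, model_architecture,
                      tokenizer_name, processor_name):
    # Ask the model-specific test class whether this pipeline test should be skipped.
    if test_case.is_pipeline_test_to_skip(
        pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    ):
        return f"{pipeline_test_casse_name} is skipped for {model_architecture.__name__}"
    return None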
......@@ -165,6 +165,15 @@ class ASTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
test_resize_embeddings = False
test_head_masking = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "AudioClassificationPipelineTests":
return True
return False
def setUp(self):
self.model_tester = ASTModelTester(self)
self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=37)
......
......@@ -237,6 +237,15 @@ class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
test_pruning = False
test_missing_keys = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "TextGenerationPipelineTests":
return True
return False
def setUp(self):
self.model_tester = BlenderbotSmallModelTester(self)
self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig)
......
......@@ -183,6 +183,15 @@ class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
test_head_masking = False
test_missing_keys = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "ObjectDetectionPipelineTests":
return True
return False
# special case for head models
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
......
......@@ -250,6 +250,15 @@ class ErnieMModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
)
test_torchscript = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "QAPipelineTests":
return True
return False
def setUp(self):
self.model_tester = ErnieMModelTester(self)
self.config_tester = ConfigTester(self, config_class=ErnieMConfig, hidden_size=37)
......
......@@ -231,6 +231,15 @@ class OneFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCas
test_head_masking = False
test_missing_keys = False
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "FeatureExtractionPipelineTests":
return True
return False
def setUp(self):
self.model_tester = OneFormerModelTester(self)
self.config_tester = ConfigTester(self, config_class=OneFormerConfig, has_text_modality=False)
......
......@@ -224,6 +224,15 @@ class SplinterModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
else {}
)
# TODO: Fix the failed tests when this model gets more usage
def is_pipeline_test_to_skip(
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
):
if pipeline_test_casse_name == "QAPipelineTests":
return True
return False
def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
inputs_dict = copy.deepcopy(inputs_dict)
if return_labels:
......
......@@ -142,18 +142,22 @@ class PipelineTesterMixin:
tokenizer_names = []
processor_names = []
commit = None
if model_arch_name in tiny_model_summary:
tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"]
processor_names = tiny_model_summary[model_arch_name]["processor_classes"]
commit = tiny_model_summary[model_arch_name]["sha"]
# Adding `None` (if empty) so we can generate tests
tokenizer_names = [None] if len(tokenizer_names) == 0 else tokenizer_names
processor_names = [None] if len(processor_names) == 0 else processor_names
repo_name = f"tiny-random-{model_arch_name}"
self.run_model_pipeline_tests(task, repo_name, model_architecture, tokenizer_names, processor_names)
self.run_model_pipeline_tests(
task, repo_name, model_architecture, tokenizer_names, processor_names, commit
)
def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenizer_names, processor_names):
def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit):
"""Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class names
Args:
......@@ -187,9 +191,9 @@ class PipelineTesterMixin:
f"`{tokenizer_name}` | processor `{processor_name}`."
)
continue
self.run_pipeline_test(task, repo_name, model_architecture, tokenizer_name, processor_name)
self.run_pipeline_test(task, repo_name, model_architecture, tokenizer_name, processor_name, commit)
def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor_name):
def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit):
"""Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class name
The model will be loaded from a model repository on the Hub.
......@@ -211,14 +215,14 @@ class PipelineTesterMixin:
tokenizer = None
if tokenizer_name is not None:
tokenizer_class = getattr(transformers_module, tokenizer_name)
tokenizer = tokenizer_class.from_pretrained(repo_id)
tokenizer = tokenizer_class.from_pretrained(repo_id, revision=commit)
processor = None
if processor_name is not None:
processor_class = getattr(transformers_module, processor_name)
# If the required packages (like `Pillow` or `torchaudio`) are not installed, this will fail.
try:
processor = processor_class.from_pretrained(repo_id)
processor = processor_class.from_pretrained(repo_id, revision=commit)
except Exception:
logger.warning(
f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not load the "
......@@ -236,7 +240,7 @@ class PipelineTesterMixin:
# TODO: We should check whether a model file is on the Hub repo instead.
try:
model = model_architecture.from_pretrained(repo_id)
model = model_architecture.from_pretrained(repo_id, revision=commit)
except Exception:
logger.warning(
f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load "
......
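The `commit` value threaded through these calls pins every `from_pretrained` download to the exact Hub revision recorded in `tiny_model_summary.json`, so later updates to a tiny model repo cannot silently change what the pipeline tests load. A minimal standalone sketch of the same `revision=` mechanism; the repo id and commit hash are placeholders:

from transformers import AutoModel, AutoTokenizer

repo_id = "hf-internal-testing/tiny-random-BertModel"  # placeholder repo id
commit = "0123456789abcdef0123456789abcdef01234567"  # placeholder commit sha

# Both downloads resolve against the pinned commit instead of the repo's main branch.
tokenizer = AutoTokenizer.from_pretrained(repo_id, revision=commit)
model = AutoModel.from_pretrained(repo_id, revision=commit)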
......@@ -27,7 +27,7 @@ from pathlib import Path
from check_config_docstrings import get_checkpoint_from_config_class
from datasets import load_dataset
from get_test_info import get_model_to_tester_mapping, get_tester_classes_for_model
from huggingface_hub import Repository, create_repo, upload_folder
from huggingface_hub import Repository, create_repo, hf_api, upload_folder
from transformers import (
CONFIG_MAPPING,
......@@ -70,6 +70,14 @@ FRAMEWORKS = ["pytorch", "tensorflow"]
INVALID_ARCH = []
TARGET_VOCAB_SIZE = 1024
data = {"training_ds": None, "testing_ds": None}
COMPOSITE_MODELS = {
"EncoderDecoderModel": "EncoderDecoderModel-bert-bert",
"SpeechEncoderDecoderModel": "SpeechEncoderDecoderModel-wav2vec2-bert",
"VisionEncoderDecoderModel": "VisionEncoderDecoderModel-vit-gpt2",
"VisionTextDualEncoderModel": "VisionTextDualEncoderModel-vit-bert",
}
# This list contains the model architectures for which a tiny version could not be created.
# Avoid adding new architectures here unless we have carefully verified that it's (almost) impossible to create them.
......@@ -179,7 +187,7 @@ def get_processor_types_from_config_class(config_class, allowed_mappings=None):
return processor_types
def get_architectures_from_config_class(config_class, arch_mappings):
def get_architectures_from_config_class(config_class, arch_mappings, models_to_skip=None):
"""Return a tuple of all possible architectures attributed to a configuration class `config_class`.
For example, BertConfig -> [BertModel, BertForMaskedLM, ..., BertForQuestionAnswering].
......@@ -192,12 +200,16 @@ def get_architectures_from_config_class(config_class, arch_mappings):
# We avoid the duplication.
architectures = set()
if models_to_skip is None:
models_to_skip = []
models_to_skip = UNCONVERTIBLE_MODEL_ARCHITECTURES.union(models_to_skip)
for mapping in arch_mappings:
if config_class in mapping:
models = mapping[config_class]
models = tuple(models) if isinstance(models, collections.abc.Sequence) else (models,)
for model in models:
if model.__name__ not in UNCONVERTIBLE_MODEL_ARCHITECTURES:
if model.__name__ not in models_to_skip:
architectures.add(model)
architectures = tuple(architectures)
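With the new `models_to_skip` argument, architectures that already have tiny checkpoints on the Hub can be filtered out at collection time, on top of the hard-coded `UNCONVERTIBLE_MODEL_ARCHITECTURES`. A hedged usage sketch, meant to run alongside this script; the mapping choice and skip list are only illustrative:

from transformers import MODEL_FOR_MASKED_LM_MAPPING, MODEL_MAPPING, BertConfig

# Collect the PyTorch architectures registered for BertConfig, skipping one that is
# assumed to already have a tiny checkpoint on the Hub.
archs = get_architectures_from_config_class(
    BertConfig,
    arch_mappings=[MODEL_MAPPING, MODEL_FOR_MASKED_LM_MAPPING],
    models_to_skip=["BertForMaskedLM"],
)
print(sorted(a.__name__ for a in archs))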
......@@ -422,11 +434,13 @@ def get_tiny_config(config_class, model_class=None, **model_tester_kwargs):
def convert_tokenizer(tokenizer_fast: PreTrainedTokenizerFast):
new_tokenizer = tokenizer_fast.train_new_from_iterator(training_ds["text"], TARGET_VOCAB_SIZE, show_progress=False)
new_tokenizer = tokenizer_fast.train_new_from_iterator(
data["training_ds"]["text"], TARGET_VOCAB_SIZE, show_progress=False
)
# Make sure it at least runs
if not isinstance(new_tokenizer, LayoutLMv3TokenizerFast):
new_tokenizer(testing_ds["text"])
new_tokenizer(data["testing_ds"]["text"])
return new_tokenizer
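`train_new_from_iterator` retrains a fast tokenizer's vocabulary from raw text, which is how `convert_tokenizer` shrinks tokenizers down to `TARGET_VOCAB_SIZE` using the wikitext splits stored in `data`. A minimal standalone sketch; the checkpoint and toy corpus are placeholders:

from transformers import AutoTokenizer

# Any fast tokenizer works; the checkpoint here is only an example.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
corpus = ["a tiny corpus", "just enough text to retrain a small vocabulary"]  # placeholder data
small_tokenizer = tokenizer.train_new_from_iterator(corpus, 1024, show_progress=False)
print(small_tokenizer("a tiny corpus"))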
......@@ -640,16 +654,17 @@ def fill_result_with_error(result, error, trace, models_to_create):
result["processor"] = {p.__class__.__name__: p.__class__.__name__ for p in result["processor"].values()}
def upload_model(model_dir, organization):
def upload_model(model_dir, organization, token):
"""Upload the tiny models"""
arch_name = model_dir.split(os.path.sep)[-1]
repo_name = f"tiny-random-{arch_name}"
repo_id = f"{organization}/{repo_name}"
repo_exist = False
error = None
try:
create_repo(repo_id=f"{organization}/{repo_name}", exist_ok=False, repo_type="model")
create_repo(repo_id=repo_id, exist_ok=False, repo_type="model", token=token)
except Exception as e:
error = e
if "You already created" in str(e):
......@@ -657,14 +672,14 @@ def upload_model(model_dir, organization):
logger.warning("Remote repository exists and will be cloned.")
repo_exist = True
try:
create_repo(repo_id=repo_name, organization=organization, exist_ok=True, repo_type="model")
create_repo(repo_id=repo_id, exist_ok=True, repo_type="model", token=token)
except Exception as e:
error = e
if error is not None:
raise error
with tempfile.TemporaryDirectory() as tmpdir:
repo = Repository(local_dir=tmpdir, clone_from=f"{organization}/{repo_name}")
repo = Repository(local_dir=tmpdir, clone_from=repo_id, token=token)
repo.git_pull()
shutil.copytree(model_dir, tmpdir, dirs_exist_ok=True)
......@@ -672,19 +687,21 @@ def upload_model(model_dir, organization):
# Open a PR on the existing Hub repo.
hub_pr_url = upload_folder(
folder_path=model_dir,
repo_id=f"{organization}/{repo_name}",
repo_id=repo_id,
repo_type="model",
commit_message=f"Update tiny models for {arch_name}",
commit_description=f"Upload tiny models for {arch_name}",
create_pr=True,
token=token,
)
logger.warning(f"PR open in {hub_pr_url}.")
# TODO: Do we need this information?
else:
# Push to Hub repo directly
repo.git_add(auto_lfs_track=True)
repo.git_commit(f"Upload tiny models for {arch_name}")
repo.git_push(blocking=True) # this prints a progress bar with the upload
logger.warning(f"Tiny models {arch_name} pushed to {organization}/{repo_name}.")
logger.warning(f"Tiny models {arch_name} pushed to {repo_id}.")
def build_composite_models(config_class, output_dir):
......@@ -704,6 +721,7 @@ def build_composite_models(config_class, output_dir):
SpeechEncoderDecoderModel,
TFEncoderDecoderModel,
TFVisionEncoderDecoderModel,
TFVisionTextDualEncoderModel,
VisionEncoderDecoderModel,
VisionTextDualEncoderModel,
ViTConfig,
......@@ -753,7 +771,7 @@ def build_composite_models(config_class, output_dir):
encoder_class = ViTModel
decoder_class = BertModel
model_class = VisionTextDualEncoderModel
tf_model_class = None
tf_model_class = TFVisionTextDualEncoderModel
with tempfile.TemporaryDirectory() as tmpdir:
try:
......@@ -1097,13 +1115,14 @@ def build(config_class, models_to_create, output_dir):
return result
def build_tiny_model_summary(results):
def build_tiny_model_summary(results, organization=None, token=None):
"""Build a summary: a dictionary of the form
{
model architecture name:
{
"tokenizer_classes": [...],
"processor_classes": [...]
"processor_classes": [...],
"model_classes": [...],
}
..
}
......@@ -1111,19 +1130,42 @@ def build_tiny_model_summary(results):
tiny_model_summary = {}
for config_name in results:
processors = [key for key, value in results[config_name]["processor"].items()]
tokenizer_classes = [x for x in processors if x.endswith("TokenizerFast") or x.endswith("Tokenizer")]
processor_classes = [x for x in processors if x not in tokenizer_classes]
tokenizer_classes = sorted([x for x in processors if x.endswith("TokenizerFast") or x.endswith("Tokenizer")])
processor_classes = sorted([x for x in processors if x not in tokenizer_classes])
for framework in FRAMEWORKS:
if framework not in results[config_name]:
continue
for arch_name in results[config_name][framework]:
model_classes = [arch_name]
base_arch_name = arch_name[2:] if arch_name.startswith("TF") else arch_name
# No tiny model was created for `arch_name`
if results[config_name][framework][arch_name] is None:
continue
tiny_model_summary[arch_name] = {
if results[config_name][framework][arch_name]["model"] is None:
model_classes = []
if base_arch_name not in tiny_model_summary:
tiny_model_summary[base_arch_name] = {}
tiny_model_summary[base_arch_name].update(
{
"tokenizer_classes": tokenizer_classes,
"processor_classes": processor_classes,
}
)
tiny_model_summary[base_arch_name]["model_classes"] = sorted(
tiny_model_summary[base_arch_name].get("model_classes", []) + model_classes
)
if organization is not None:
repo_name = f"tiny-random-{base_arch_name}"
# Composite models' checkpoints have more precise repo names on the Hub.
if base_arch_name in COMPOSITE_MODELS:
repo_name = f"tiny-random-{COMPOSITE_MODELS[base_arch_name]}"
repo_id = f"{organization}/{repo_name}"
try:
commit_hash = hf_api.repo_info(repo_id, token=token).sha
except Exception:
# The directory is not created, but processor(s) is/are included in `results`.
logger.warning(f"Failed to get information for {repo_id}.\n{traceback.format_exc()}")
del tiny_model_summary[base_arch_name]
continue
tiny_model_summary[base_arch_name]["sha"] = commit_hash
return tiny_model_summary
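Each entry in the returned summary now groups the PyTorch and TensorFlow classes under a single base architecture name and records the commit sha of the corresponding Hub repo. A hypothetical example entry; the class names and sha are illustrative, not taken from the real `tiny_model_summary.json`:

{
    "BertModel": {
        "tokenizer_classes": ["BertTokenizer", "BertTokenizerFast"],
        "processor_classes": [],
        "model_classes": ["BertModel", "TFBertModel"],
        "sha": "0123456789abcdef0123456789abcdef01234567"
    }
}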
......@@ -1176,11 +1218,23 @@ def build_simple_report(results):
return text, failed_text
if __name__ == "__main__":
def create_tiny_models(
output_path,
all,
model_types,
models_to_skip,
no_check,
upload,
organization,
token,
):
clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
if os.getcwd() != clone_path:
raise ValueError(f"This script should be run from the root of the clone of `transformers` {clone_path}")
report_path = os.path.join(output_path, "reports")
os.makedirs(report_path)
_pytorch_arch_mappings = [
x
for x in dir(transformers_module)
......@@ -1189,118 +1243,145 @@ if __name__ == "__main__":
_tensorflow_arch_mappings = [
x for x in dir(transformers_module) if x.startswith("TF_MODEL_") and x.endswith("_MAPPING")
]
# _flax_arch_mappings = [x for x in dir(transformers_module) if x.startswith("FLAX_MODEL_") and x.endswith("_MAPPING")]
pytorch_arch_mappings = [getattr(transformers_module, x) for x in _pytorch_arch_mappings]
tensorflow_arch_mappings = [getattr(transformers_module, x) for x in _tensorflow_arch_mappings]
# flax_arch_mappings = [getattr(transformers_module, x) for x in _flax_arch_mappings]
ds = load_dataset("wikitext", "wikitext-2-raw-v1")
training_ds = ds["train"]
testing_ds = ds["test"]
def list_str(values):
return values.split(",")
parser = argparse.ArgumentParser()
parser.add_argument("--all", action="store_true", help="Will create all tiny models.")
parser.add_argument(
"--no_check",
action="store_true",
help="If set, will not check the validity of architectures. Use with caution.",
)
parser.add_argument(
"-m",
"--model_types",
type=list_str,
help="Comma-separated list of model type(s) from which the tiny models will be created.",
)
parser.add_argument("--upload", action="store_true", help="If to upload the created tiny models to the Hub.")
parser.add_argument(
"--organization",
default=None,
type=str,
help="The organization on the Hub to which the tiny models will be uploaded.",
)
parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.")
args = parser.parse_args()
if not args.all and not args.model_types:
raise ValueError("Please provide at least one model type or pass `--all` to export all architectures.")
data["training_ds"] = ds["train"]
data["testing_ds"] = ds["test"]
config_classes = CONFIG_MAPPING.values()
if not args.all:
config_classes = [CONFIG_MAPPING[model_type] for model_type in args.model_types]
if not all:
config_classes = [CONFIG_MAPPING[model_type] for model_type in model_types]
# A map from config classes to tuples of processors (tokenizer, feature extractor, processor) classes
processor_type_map = {c: get_processor_types_from_config_class(c) for c in config_classes}
to_create = {
c: {
"processor": processor_type_map[c],
"pytorch": get_architectures_from_config_class(c, pytorch_arch_mappings),
"tensorflow": get_architectures_from_config_class(c, tensorflow_arch_mappings),
# "flax": get_architectures_from_config_class(c, flax_arch_mappings),
}
for c in config_classes
}
to_create = {}
for c in config_classes:
processors = processor_type_map[c]
models = get_architectures_from_config_class(c, pytorch_arch_mappings, models_to_skip)
tf_models = get_architectures_from_config_class(c, tensorflow_arch_mappings, models_to_skip)
if len(models) + len(tf_models) > 0:
to_create[c] = {"processor": processors, "pytorch": models, "tensorflow": tf_models}
results = {}
for c, models_to_create in list(to_create.items()):
print(f"Create models for {c.__name__} ...")
result = build(c, models_to_create, output_dir=os.path.join(args.output_path, c.model_type))
result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
results[c.__name__] = result
print("=" * 40)
with open("tiny_model_creation_report.json", "w") as fp:
json.dump(results, fp, indent=4)
if upload:
if organization is None:
raise ValueError("The argument `organization` could not be `None`. No model is uploaded")
to_upload = []
for model_type in os.listdir(output_path):
# This is the directory containing the reports
if model_type == "reports":
continue
for arch in os.listdir(os.path.join(output_path, model_type)):
if arch == "processors":
continue
to_upload.append(os.path.join(output_path, model_type, arch))
to_upload = sorted(to_upload)
upload_results = {}
if len(to_upload) > 0:
for model_dir in to_upload:
try:
upload_model(model_dir, organization, token)
except Exception as e:
error = f"Failed to upload {model_dir}. {e.__class__.__name__}: {e}"
logger.error(error)
upload_results[model_dir] = error
with open(os.path.join(report_path, "failed_uploads.json"), "w") as fp:
json.dump(upload_results, fp, indent=4)
# Build the tiny model summary file. The `tokenizer_classes` and `processor_classes` could both be empty lists.
# When using the items in this file to update the file `tests/utils/tiny_model_summary.json`, the model
# architectures with `tokenizer_classes` and `processor_classes` being both empty should **NOT** be added to
# `tests/utils/tiny_model_summary.json`.
tiny_model_summary = build_tiny_model_summary(results)
with open("tiny_model_summary.json", "w") as fp:
tiny_model_summary = build_tiny_model_summary(results, organization=organization, token=token)
with open(os.path.join(report_path, "tiny_model_summary.json"), "w") as fp:
json.dump(tiny_model_summary, fp, indent=4)
with open(os.path.join(report_path, "tiny_model_creation_report.json"), "w") as fp:
json.dump(results, fp, indent=4)
# Build the warning/failure report (json format): same format as the complete `results` except this contains only
# warnings or errors.
failed_results = build_failed_report(results)
with open("failed_report.json", "w") as fp:
with open(os.path.join(report_path, "failed_report.json"), "w") as fp:
json.dump(failed_results, fp, indent=4)
simple_report, failed_report = build_simple_report(results)
# The simplified report: a .txt file with each line of format:
# {model architecture name}: {OK or error message}
with open("simple_report.txt", "w") as fp:
with open(os.path.join(report_path, "simple_report.txt"), "w") as fp:
fp.write(simple_report)
# The simplified failure report: same above except this only contains line with errors
with open("simple_failed_report.txt", "w") as fp:
with open(os.path.join(report_path, "simple_failed_report.txt"), "w") as fp:
fp.write(failed_report)
if args.upload:
if args.organization is None:
raise ValueError("The argument `organization` could not be `None`. No model is uploaded")
to_upload = []
for model_type in os.listdir(args.output_path):
for arch in os.listdir(os.path.join(args.output_path, model_type)):
if arch == "processors":
continue
to_upload.append(os.path.join(args.output_path, model_type, arch))
to_upload = sorted(to_upload)
if __name__ == "__main__":
ds = load_dataset("wikitext", "wikitext-2-raw-v1")
training_ds = ds["train"]
testing_ds = ds["test"]
upload_results = {}
if len(to_upload) > 0:
for model_dir in to_upload:
try:
upload_model(model_dir, args.organization)
except Exception as e:
error = f"Failed to upload {model_dir}. {e.__class__.__name__}: {e}"
logger.error(error)
upload_results[model_dir] = error
def list_str(values):
return values.split(",")
with open("failed_uploads.json", "w") as fp:
json.dump(upload_results, fp, indent=4)
parser = argparse.ArgumentParser()
parser.add_argument("--all", action="store_true", help="Will create all tiny models.")
parser.add_argument(
"--no_check",
action="store_true",
help="If set, will not check the validity of architectures. Use with caution.",
)
parser.add_argument(
"-m",
"--model_types",
type=list_str,
help="Comma-separated list of model type(s) from which the tiny models will be created.",
)
parser.add_argument(
"--models_to_skip",
type=list_str,
help=(
"Comma-separated list of model class names(s) from which the tiny models won't be created.\nThis is usually"
"the list of model classes that have their tiny versions already uploaded to the Hub."
),
)
parser.add_argument("--upload", action="store_true", help="If to upload the created tiny models to the Hub.")
parser.add_argument(
"--organization",
default=None,
type=str,
help="The organization on the Hub to which the tiny models will be uploaded.",
)
parser.add_argument(
"--token", default=None, type=str, help="A valid authentication token for HuggingFace Hub with write access."
)
parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.")
args = parser.parse_args()
if not args.all and not args.model_types:
raise ValueError("Please provide at least one model type or pass `--all` to export all architectures.")
create_tiny_models(
args.output_path,
args.all,
args.model_types,
args.models_to_skip,
args.no_check,
args.upload,
args.organization,
args.token,
)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A script running `create_dummy_models.py` with a pre-defined set of arguments.
This file is intended to be used in a CI workflow without the need to specify arguments. It creates and uploads
tiny models for all model classes (if their tiny versions are not on the Hub yet) and produces an updated version
of `tests/utils/tiny_model_summary.json`. That updated file should be merged into the `main` branch of
`transformers` so the pipeline tests use the latest created/updated tiny models.
"""
import copy
import json
import os
import time
from create_dummy_models import COMPOSITE_MODELS, create_tiny_models
from huggingface_hub import ModelFilter, hf_api
import transformers
from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoTokenizer
from transformers.image_processing_utils import BaseImageProcessor
def get_all_model_names():
model_names = set()
# Each auto modeling file contains multiple mappings. Let's get them dynamically.
for module_name in ["modeling_auto", "modeling_tf_auto", "modeling_flax_auto"]:
module = getattr(transformers.models.auto, module_name, None)
if module is None:
continue
# all mappings in a single auto modeling file
mapping_names = [
x
for x in dir(module)
if x.endswith("_MAPPING_NAMES")
and (x.startswith("MODEL_") or x.startswith("TF_MODEL_") or x.startswith("FLAX_MODEL_"))
]
for name in mapping_names:
mapping = getattr(module, name)
if mapping is not None:
for v in mapping.values():
if isinstance(v, (list, tuple)):
model_names.update(v)
elif isinstance(v, str):
model_names.add(v)
return sorted(model_names)
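For context, each `*_MAPPING_NAMES` object collected above maps a model type string to one or more architecture class names. A small sketch of inspecting one of them directly; the chosen mapping is only an example:

from transformers.models.auto import modeling_auto

# One of the mappings the loop above iterates over: model type -> class name(s).
mapping = modeling_auto.MODEL_FOR_MASKED_LM_MAPPING_NAMES
for model_type, class_name in list(mapping.items())[:3]:
    print(model_type, "->", class_name)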
def get_tiny_model_names_from_repo():
# All model names defined in auto mappings
model_names = set(get_all_model_names())
with open("tests/utils/tiny_model_summary.json") as fp:
tiny_model_info = json.load(fp)
tiny_models_names = set()
for model_base_name in tiny_model_info:
tiny_models_names.update(tiny_model_info[model_base_name]["model_classes"])
# Remove a tiny model name if its counterpart in the other framework doesn't yet have a tiny version on the Hub, so both get (re)created together.
not_on_hub = model_names.difference(tiny_models_names)
for model_name in copy.copy(tiny_models_names):
if not model_name.startswith("TF") and f"TF{model_name}" in not_on_hub:
tiny_models_names.remove(model_name)
elif model_name.startswith("TF") and model_name[2:] in not_on_hub:
tiny_models_names.remove(model_name)
return sorted(tiny_models_names)
def get_tiny_model_summary_from_hub(output_path):
special_models = COMPOSITE_MODELS.values()
# All tiny model base names on Hub
model_names = get_all_model_names()
models = hf_api.list_models(
filter=ModelFilter(
author="hf-internal-testing",
)
)
_models = set()
for x in models:
model = x.modelId
org, model = model.split("/")
if not model.startswith("tiny-random-"):
continue
model = model.replace("tiny-random-", "")
if not model[0].isupper():
continue
if model not in model_names and model not in special_models:
continue
_models.add(model)
models = sorted(_models)
# All tiny model names on Hub
summary = {}
for model in models:
repo_id = f"hf-internal-testing/tiny-random-{model}"
model = model.split("-")[0]
try:
repo_info = hf_api.repo_info(repo_id)
content = {
"tokenizer_classes": set(),
"processor_classes": set(),
"model_classes": set(),
"sha": repo_info.sha,
}
except Exception:
continue
try:
time.sleep(1)
tokenizer_fast = AutoTokenizer.from_pretrained(repo_id)
content["tokenizer_classes"].add(tokenizer_fast.__class__.__name__)
except Exception:
pass
try:
time.sleep(1)
tokenizer_slow = AutoTokenizer.from_pretrained(repo_id, use_fast=False)
content["tokenizer_classes"].add(tokenizer_slow.__class__.__name__)
except Exception:
pass
try:
time.sleep(1)
img_p = AutoImageProcessor.from_pretrained(repo_id)
content["processor_classes"].add(img_p.__class__.__name__)
except Exception:
pass
try:
time.sleep(1)
feat_p = AutoFeatureExtractor.from_pretrained(repo_id)
if not isinstance(feat_p, BaseImageProcessor):
content["processor_classes"].add(feat_p.__class__.__name__)
except Exception:
pass
try:
time.sleep(1)
model_class = getattr(transformers, model)
m = model_class.from_pretrained(repo_id)
content["model_classes"].add(m.__class__.__name__)
except Exception:
pass
try:
time.sleep(1)
model_class = getattr(transformers, f"TF{model}")
m = model_class.from_pretrained(repo_id)
content["model_classes"].add(m.__class__.__name__)
except Exception:
pass
content["tokenizer_classes"] = sorted(content["tokenizer_classes"])
content["processor_classes"] = sorted(content["processor_classes"])
content["model_classes"] = sorted(content["model_classes"])
summary[model] = content
with open(os.path.join(output_path, "hub_tiny_model_summary.json"), "w") as fp:
json.dump(summary, fp, ensure_ascii=False, indent=4)
def update_tiny_model_summary_file(report_path):
with open(os.path.join(report_path, "tiny_model_summary.json")) as fp:
new_data = json.load(fp)
with open("tests/utils/tiny_model_summary.json") as fp:
data = json.load(fp)
for key, value in new_data.items():
if key not in data:
data[key] = value
else:
for attr in ["tokenizer_classes", "processor_classes", "model_classes"]:
# We might get duplicates here; they are removed below when creating `updated_data`.
data[key][attr].extend(value[attr])
new_sha = value["sha"]
if new_sha is not None:
data[key]["sha"] = new_sha
updated_data = {}
for key in sorted(data.keys()):
updated_data[key] = {}
for attr, value in data[key].items():
# deduplication and sort
updated_data[key][attr] = sorted(set(value)) if attr != "sha" else value
with open(os.path.join(report_path, "updated_tiny_model_summary.json"), "w") as fp:
json.dump(updated_data, fp, indent=4, ensure_ascii=False)
if __name__ == "__main__":
output_path = "tiny_models"
all = True
model_types = None
models_to_skip = get_tiny_model_names_from_repo()
no_check = True
upload = True
organization = "hf-internal-testing"
create_tiny_models(
output_path,
all,
model_types,
models_to_skip,
no_check,
upload,
organization,
token=os.environ.get("TOKEN", None),
)
update_tiny_model_summary_file(report_path=os.path.join(output_path, "reports"))