Unverified commit 2c22bc79, authored by Yih-Dar, committed by GitHub

Make tiny model creation + pipeline testing more robust (#22500)



* Final Tiny things

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 12d51db2
-name: Update Tiny Models
+name: Check Tiny Models
 
 on:
   push:
     branches:
-      - update_tiny_models*
+      - check_tiny_models*
   repository_dispatch:
   schedule:
     - cron: "0 2 * * *"
@@ -12,8 +12,8 @@ env:
   TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
 
 jobs:
-  update_tiny_models:
-    name: Update tiny models
+  check_tiny_models:
+    name: Check tiny models
     runs-on: ubuntu-latest
     steps:
       - name: Checkout transformers
@@ -21,14 +21,49 @@ jobs:
         with:
          fetch-depth: 2
 
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          # Semantic version range syntax or exact version of a Python version
+          python-version: '3.8'
+          # Optional - x64 or x86 architecture, defaults to x64
+          architecture: 'x64'
 
       - name: Install
         run: |
-          python -m pip install -U .[dev]
-          python -m pip install -U natten
+          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake
+          pip install --upgrade pip
+          python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu]
+          pip install tensorflow_probability
+          python -m pip install -U natten
 
+      - name: Create all tiny models (locally)
+        run: |
+          python utils/create_dummy_models.py tiny_local_models --all --num_workers 2
 
+      - name: Local tiny model reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_local_model_creation_reports
+          path: tiny_local_models/reports
 
+      # GitHub-hosted runners have 2-core CPUs
+      - name: Run pipeline tests against all new (local) tiny models
+        run: |
+          OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt
 
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_local_model_creation_reports
+          path: reports/tests_pipelines
 
-      - name: Update tiny models
+      - name: Create + Upload tiny models for new model architecture(s)
         run: |
-          python utils/update_tiny_models.py
+          python utils/update_tiny_models.py --num_workers 2
 
       - name: Full report
         run: cat tiny_models/reports/tiny_model_creation_report.json
@@ -39,7 +74,7 @@ jobs:
       - name: Summary report
         run: cat tiny_models/reports/tiny_model_summary.json
 
-      - name: Test suite reports artifacts
+      - name: New tiny model creation reports artifacts
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
......
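The two new CI steps above can be reproduced locally for debugging. A minimal sketch, assuming a transformers checkout with the dev dependencies (including pytest-xdist) installed; nothing here is an official entry point, and the worker count simply mirrors the 2-core runners:

import os
import subprocess

# Step 1: create every tiny checkpoint under a local folder (mirrors the CI step).
subprocess.run(
    ["python", "utils/create_dummy_models.py", "tiny_local_models", "--all", "--num_workers", "2"],
    check=True,
)

# Step 2: run the pipeline tests against that local folder instead of the Hub.
env = dict(os.environ, OMP_NUM_THREADS="1", TRANSFORMERS_TINY_MODEL_PATH="tiny_local_models")
subprocess.run(
    ["python", "-m", "pytest", "-n", "2", "--dist=loadfile", "-m", "is_pipeline_test",
     "-k", "test_pipeline_", "tests/models"],
    check=True,
    env=env,
)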
@@ -294,6 +294,15 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
     test_missing_keys = False
     test_torchscript = False
 
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"):
+            return True
+
+        return False
+
     def setUp(self):
         self.model_tester = LEDModelTester(self)
         self.config_tester = ConfigTester(self, config_class=LEDConfig)
......
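`is_pipeline_test_to_skip` is the per-model escape hatch that `PipelineTesterMixin` consults before running each generated pipeline test. A self-contained sketch of the LED override's effect; the class and calls below are hypothetical stand-ins, not the mixin's real machinery:

class LEDPipelineSkips:
    def is_pipeline_test_to_skip(
        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    ):
        # Same rule as the LED override above: QA pipeline tests only run with a fast tokenizer.
        if pipeline_test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast"):
            return True
        return False


skips = LEDPipelineSkips()
print(skips.is_pipeline_test_to_skip("QAPipelineTests", None, None, "LEDTokenizer", None))      # True  -> skipped
print(skips.is_pipeline_test_to_skip("QAPipelineTests", None, None, "LEDTokenizerFast", None))  # False -> runs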
@@ -265,6 +265,13 @@ class NllbMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
     test_missing_keys = True
     test_torchscript = False
 
+    # TODO: Fix the failed tests when this model gets more usage
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        # Saving the slow tokenizer after saving the fast tokenizer causes the loading of the latter to hang forever.
+        return True
+
     def setUp(self):
         self.model_tester = NllbMoeModelTester(self)
         self.config_tester = ConfigTester(self, config_class=NllbMoeConfig)
......
@@ -230,6 +230,8 @@ class SplinterModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase,
     ):
         if pipeline_test_casse_name == "QAPipelineTests":
             return True
+        elif pipeline_test_casse_name == "FeatureExtractionPipelineTests" and tokenizer_name.endswith("Fast"):
+            return True
 
         return False
......
@@ -93,7 +93,14 @@ for task, task_info in pipeline_test_mapping.items():
 }
 
-TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(Path(__file__).parent.parent, "tests/utils/tiny_model_summary.json")
+# The default value `hf-internal-testing` is for running the pipeline testing against the tiny models on the Hub.
+# For debugging purposes, we can specify a local path which is the `output_path` argument of a previous run of
+# `utils/create_dummy_models.py`.
+TRANSFORMERS_TINY_MODEL_PATH = os.environ.get("TRANSFORMERS_TINY_MODEL_PATH", "hf-internal-testing")
+if TRANSFORMERS_TINY_MODEL_PATH == "hf-internal-testing":
+    TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(Path(__file__).parent.parent, "tests/utils/tiny_model_summary.json")
+else:
+    TINY_MODEL_SUMMARY_FILE_PATH = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, "reports", "tiny_model_summary.json")
 
 with open(TINY_MODEL_SUMMARY_FILE_PATH) as fp:
     tiny_model_summary = json.load(fp)
@@ -146,12 +153,15 @@ class PipelineTesterMixin:
             if model_arch_name in tiny_model_summary:
                 tokenizer_names = tiny_model_summary[model_arch_name]["tokenizer_classes"]
                 processor_names = tiny_model_summary[model_arch_name]["processor_classes"]
-                commit = tiny_model_summary[model_arch_name]["sha"]
+                if "sha" in tiny_model_summary[model_arch_name]:
+                    commit = tiny_model_summary[model_arch_name]["sha"]
 
             # Adding `None` (if empty) so we can generate tests
             tokenizer_names = [None] if len(tokenizer_names) == 0 else tokenizer_names
             processor_names = [None] if len(processor_names) == 0 else processor_names
 
             repo_name = f"tiny-random-{model_arch_name}"
+            if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing":
+                repo_name = model_arch_name
 
             self.run_model_pipeline_tests(
                 task, repo_name, model_architecture, tokenizer_names, processor_names, commit
@@ -210,7 +220,10 @@ class PipelineTesterMixin:
             processor_name (`str`):
                 The name of a subclass of `BaseImageProcessor` or `FeatureExtractionMixin`.
         """
-        repo_id = f"hf-internal-testing/{repo_name}"
+        repo_id = f"{TRANSFORMERS_TINY_MODEL_PATH}/{repo_name}"
+        if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing":
+            model_type = model_architecture.config_class.model_type
+            repo_id = os.path.join(TRANSFORMERS_TINY_MODEL_PATH, model_type, repo_name)
 
         tokenizer = None
         if tokenizer_name is not None:
......
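Taken together, these hunks give the suite a single switch between Hub mode and local-debug mode. A hedged sketch distilling the repo-resolution logic into a standalone helper; the helper name and example values are illustrative, not part of the mixin:

import os

def resolve_repo_id(tiny_model_path: str, repo_name: str, model_type: str) -> str:
    if tiny_model_path == "hf-internal-testing":
        # Hub mode: tiny checkpoints live in the hf-internal-testing organization.
        return f"{tiny_model_path}/{repo_name}"
    # Local-debug mode: checkpoints sit under <output_path>/<model_type>/<arch name>,
    # the layout produced by `utils/create_dummy_models.py`.
    return os.path.join(tiny_model_path, model_type, repo_name)


print(resolve_repo_id("hf-internal-testing", "tiny-random-LEDModel", "led"))
# -> hf-internal-testing/tiny-random-LEDModel
print(resolve_repo_id("tiny_local_models", "LEDModel", "led"))
# -> tiny_local_models/led/LEDModel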
@@ -18,6 +18,7 @@ import collections.abc
 import copy
 import inspect
 import json
+import multiprocessing
 import os
 import shutil
 import tempfile
@@ -679,12 +680,22 @@ def convert_processors(processors, tiny_config, output_folder, result):
     if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
         if fast_tokenizer is not None:
-            if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
+            if fast_tokenizer.__class__.__name__ in [
+                "RobertaTokenizerFast",
+                "XLMRobertaTokenizerFast",
+                "LongformerTokenizerFast",
+                "MPNetTokenizerFast",
+            ]:
                 fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
             else:
                 fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
         if slow_tokenizer is not None:
-            if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
+            if slow_tokenizer.__class__.__name__ in [
+                "RobertaTokenizer",
+                "XLMRobertaTokenizer",
+                "LongformerTokenizer",
+                "MPNetTokenizer",
+            ]:
                 slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
             else:
                 slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
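The `- 2` compensates for RoBERTa-style position embeddings, which reserve two position ids (the padding id plus the offset after it), so the usable sequence length is `max_position_embeddings - 2`; roberta-base, for instance, ships with `max_position_embeddings = 514` and a tokenizer `model_max_length` of 512. A toy restatement of the rule; only the tokenizer class names come from the diff, the helper itself is illustrative:

ROBERTA_LIKE_FAST = {
    "RobertaTokenizerFast",
    "XLMRobertaTokenizerFast",
    "LongformerTokenizerFast",
    "MPNetTokenizerFast",
}

def clamped_model_max_length(tokenizer_class_name: str, max_position_embeddings: int) -> int:
    # RoBERTa-style models burn two position ids, so the tokenizer must stop
    # two tokens short of the position-embedding table.
    if tokenizer_class_name in ROBERTA_LIKE_FAST:
        return max_position_embeddings - 2
    return max_position_embeddings

print(clamped_model_max_length("RobertaTokenizerFast", 514))  # 512, matching roberta-base
print(clamped_model_max_length("BertTokenizerFast", 512))     # 512, unchanged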
@@ -1047,6 +1058,10 @@ def build(config_class, models_to_create, output_dir):
             The directory to save all the checkpoints. Each model architecture will be saved in a subdirectory under
             it. Models in different frameworks with the same architecture will be saved in the same subdirectory.
     """
+    if data["training_ds"] is None or data["testing_ds"] is None:
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
+        data["training_ds"] = ds["train"]
+        data["testing_ds"] = ds["test"]
+
     if config_class.model_type in [
         "encoder-decoder",
@@ -1323,6 +1338,7 @@ def create_tiny_models(
     upload,
     organization,
     token,
+    num_workers=1,
 ):
     clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
     if os.getcwd() != clone_path:
@@ -1343,10 +1359,6 @@ def create_tiny_models(
     pytorch_arch_mappings = [getattr(transformers_module, x) for x in _pytorch_arch_mappings]
     tensorflow_arch_mappings = [getattr(transformers_module, x) for x in _tensorflow_arch_mappings]
 
-    ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-    data["training_ds"] = ds["train"]
-    data["testing_ds"] = ds["test"]
-
     config_classes = CONFIG_MAPPING.values()
     if not all:
         config_classes = [CONFIG_MAPPING[model_type] for model_type in model_types]
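This removal pairs with the addition inside `build()` above: once workers are started with the `spawn` method, they re-import the module instead of inheriting the parent's globals, so each process must load the dataset lazily on first use. A runnable sketch of that pattern; the worker body is a stand-in for the real `build`:

import multiprocessing

# Module-level cache; in every spawned worker this re-initializes to None.
data = {"training_ds": None, "testing_ds": None}

def build_one(model_type: str) -> str:
    # Lazy init: under `spawn`, globals set in the parent are NOT inherited.
    if data["training_ds"] is None or data["testing_ds"] is None:
        # The real script calls load_dataset("wikitext", "wikitext-2-raw-v1") here.
        data["training_ds"] = f"train (pid {multiprocessing.current_process().pid})"
        data["testing_ds"] = "test"
    return f"{model_type}: {data['training_ds']}"

if __name__ == "__main__":
    # Same start method as the script; `fork` can inherit locked state and hang.
    multiprocessing.set_start_method("spawn")
    with multiprocessing.Pool(2) as pool:
        print(pool.map(build_one, ["bert", "gpt2", "t5"]))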
@@ -1363,11 +1375,19 @@ def create_tiny_models(
         to_create[c] = {"processor": processors, "pytorch": models, "tensorflow": tf_models}
 
     results = {}
-    for c, models_to_create in list(to_create.items()):
-        print(f"Create models for {c.__name__} ...")
-        result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
-        results[c.__name__] = result
-        print("=" * 40)
+    if num_workers <= 1:
+        for c, models_to_create in list(to_create.items()):
+            print(f"Create models for {c.__name__} ...")
+            result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
+            results[c.__name__] = result
+            print("=" * 40)
+    else:
+        all_build_args = []
+        for c, models_to_create in list(to_create.items()):
+            all_build_args.append((c, models_to_create, os.path.join(output_path, c.model_type)))
+        with multiprocessing.Pool() as pool:
+            results = pool.starmap(build, all_build_args)
+            results = {build_args[0].__name__: result for build_args, result in zip(all_build_args, results)}
 
     if upload:
         if organization is None:
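`Pool.starmap` unpacks each argument tuple and returns results in input order, which is what makes zipping `all_build_args` back against `results` safe. A toy version of the same pattern; the worker and its arguments are stand-ins:

import multiprocessing

def build(config_name: str, n_models: int, output_dir: str) -> dict:
    # Stand-in for the real `build`; returns a tiny per-config report.
    return {"models": n_models, "dir": output_dir}

if __name__ == "__main__":
    multiprocessing.set_start_method("spawn")
    all_build_args = [("bert", 2, "out/bert"), ("gpt2", 1, "out/gpt2")]
    with multiprocessing.Pool() as pool:
        # Results come back in the order of all_build_args.
        results = pool.starmap(build, all_build_args)
    results = {build_args[0]: result for build_args, result in zip(all_build_args, results)}
    print(results)  # {'bert': {...}, 'gpt2': {...}}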
@@ -1426,9 +1446,8 @@ create_tiny_models(
 
 if __name__ == "__main__":
-    ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-    training_ds = ds["train"]
-    testing_ds = ds["test"]
+    # This has to be `spawn` to avoid hanging forever!
+    multiprocessing.set_start_method("spawn")
 
     def list_str(values):
         return values.split(",")
@@ -1465,6 +1484,7 @@ if __name__ == "__main__":
         "--token", default=None, type=str, help="A valid authentication token for HuggingFace Hub with write access."
     )
     parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.")
+    parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.")
 
     args = parser.parse_args()
@@ -1480,4 +1500,5 @@ if __name__ == "__main__":
         args.upload,
         args.organization,
         args.token,
+        args.num_workers,
     )
@@ -21,8 +21,10 @@ version of `tests/utils/tiny_model_summary.json`. That updated file should be me
 """
 
+import argparse
 import copy
 import json
+import multiprocessing
 import os
 import time
@@ -197,6 +199,13 @@ def update_tiny_model_summary_file(report_path):
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.")
+    args = parser.parse_args()
+
+    # This has to be `spawn` to avoid hanging forever!
+    multiprocessing.set_start_method("spawn")
+
     output_path = "tiny_models"
     all = True
     model_types = None
@@ -214,6 +223,7 @@ if __name__ == "__main__":
         upload,
         organization,
         token=os.environ.get("TOKEN", None),
+        num_workers=args.num_workers,
     )
 
     update_tiny_model_summary_file(report_path=os.path.join(output_path, "reports"))