Unverified Commit 316bf04d authored by Yih-Dar, committed by GitHub

Improve tiny model creation script (#20119)



* Improve tiny model creation script

* sort the list of models to upload
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 4eb918e6
@@ -21,11 +21,13 @@ import json
import os
import shutil
import sys
import tempfile
from pathlib import Path
from datasets import load_dataset
from check_config_docstrings import get_checkpoint_from_config_class
from huggingface_hub import Repository, create_repo, upload_folder
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
@@ -33,17 +35,23 @@ from transformers import (
TOKENIZER_MAPPING,
AutoTokenizer,
LayoutLMv3TokenizerFast,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
logging,
)
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.file_utils import is_tf_available, is_torch_available
from transformers.models.auto.configuration_auto import AutoConfig, model_type_to_module_name
from transformers.models.fsmt import configuration_fsmt
from transformers.processing_utils import ProcessorMixin, transformers_module
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
# make sure tokenizer plays nice with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.set_verbosity_error()
logging.disable_progress_bar()
logger = logging.get_logger(__name__)
sys.path.append(".")
@@ -131,7 +139,7 @@ def get_config_class_from_processor_class(processor_class):
"""
processor_prefix = processor_class.__name__
for postfix in ["Processor", "TokenizerFast", "Tokenizer", "FeatureExtractor"]:
for postfix in ["TokenizerFast", "Tokenizer", "ImageProcessor", "FeatureExtractor", "Processor"]:
processor_prefix = processor_prefix.replace(postfix, "")
# `Wav2Vec2CTCTokenizer` -> `Wav2Vec2Config`
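The ordering of the postfixes matters: "TokenizerFast" must be stripped before "Tokenizer" (otherwise "BertTokenizerFast" would be left as "BertFast"), and "Processor" must come last so that "ImageProcessor" is matched as a whole. Below is a minimal sketch of just this prefix computation; `processor_to_prefix` is a hypothetical name, and the helper's follow-up lookup of the resulting prefix against the config classes is truncated in this hunk.

def processor_to_prefix(processor_class_name):
    # Strip the most specific postfixes first.
    for postfix in ["TokenizerFast", "Tokenizer", "ImageProcessor", "FeatureExtractor", "Processor"]:
        processor_class_name = processor_class_name.replace(postfix, "")
    return processor_class_name

print(processor_to_prefix("LayoutLMv3TokenizerFast"))  # LayoutLMv3
print(processor_to_prefix("ViTImageProcessor"))        # ViT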
@@ -188,7 +196,11 @@ def build_processor(config_class, processor_class):
logger.error(e)
config = None
if config is not None:
assert isinstance(config, config_class)
if not isinstance(config, config_class):
raise ValueError(
f"`config` (which is of type {config.__class__.__name__}) should be an instance of `config_class`"
f" ({config_class.__name__})!"
)
tokenizer_class = config.tokenizer_class
new_processor_class = None
if tokenizer_class is not None:
@@ -210,6 +222,11 @@ def build_processor(config_class, processor_class):
]
if len(new_processor_classes) > 0:
new_processor_class = new_processor_classes[0]
# Let's use the fast tokenizer if there is one
for x in new_processor_classes:
if x.__name__.endswith("Fast"):
new_processor_class = x
break
processor = build_processor(config_class, new_processor_class)
if processor is None:
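The loop above keeps the first candidate as a fallback and switches to the first fast (Rust-backed) tokenizer it finds. A hedged sketch of the selection with made-up candidate names:

# Illustrative only; `new_processor_classes` would hold real processor classes.
new_processor_classes = ["BertTokenizer", "BertTokenizerFast"]
new_processor_class = new_processor_classes[0]
for x in new_processor_classes:
    if x.endswith("Fast"):  # the real code checks `x.__name__`
        new_processor_class = x
        break
print(new_processor_class)  # BertTokenizerFast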
@@ -248,12 +265,16 @@ def build_processor(config_class, processor_class):
# validation
if processor is not None:
assert isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")
if not (isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")):
raise ValueError(
f"`processor` (which is of type {processor.__class__.__name__}) should be an instance of"
f" {processor_class.__name__} or an Auto class!"
)
return processor
def get_tiny_config(config_class):
def get_tiny_config(config_class, **model_tester_kwargs):
"""Retrieve a tiny configuration from `config_class` using each model's `ModelTester`.
Args:
@@ -269,25 +290,26 @@ def get_tiny_config(config_class):
# A simple way is to use `inspect.getsourcefile(config_class)`.
config_source_file = inspect.getsourcefile(config_class)
# The modeling file name without the `modeling_` prefix and `.py` postfix (derived here from the configuration file name)
modeling_name = config_source_file.split("/")[-1].replace("configuration_", "").replace(".py", "")
modeling_name = config_source_file.split(os.path.sep)[-1].replace("configuration_", "").replace(".py", "")
try:
print("Importing", model_type_to_module_name(model_type))
module_name = model_type_to_module_name(model_type)
assert modeling_name.startswith(module_name)
if not modeling_name.startswith(module_name):
raise ValueError(f"{modeling_name} doesn't start with {module_name}!")
module = importlib.import_module(f".models.{module_name}.test_modeling_{modeling_name}", package="tests")
camel_case_model_name = config_class.__name__.split("Config")[0]
model_tester_class = getattr(module, f"{camel_case_model_name}ModelTester", None)
except ModuleNotFoundError as e:
error = f"Tiny config not created for {model_type} - cannot find the testing module from the model name."
raise ValueError(f"{error}: {str(e)}")
error = f"Tiny config not created for {model_type} - cannot find the testing module from the model name"
raise ValueError(f"{error}: {e}")
if model_tester_class is None:
error = f"Tiny config not created for {model_type} - no model tester is found in the testing module."
error = f"Tiny config not created for {model_type} - no model tester is found in the testing module"
raise ValueError(error)
# `parent` is an instance of `unittest.TestCase`, but we don't need it here.
model_tester = model_tester_class(parent=None)
model_tester = model_tester_class(parent=None, **model_tester_kwargs)
if hasattr(model_tester, "get_pipeline_config"):
return model_tester.get_pipeline_config()
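The new `**model_tester_kwargs` pass-through lets callers override model-tester attributes; `get_config_overrides` below uses it to shrink `vocab_size`. A hypothetical call, assuming a transformers checkout where the `tests` package is importable and a model whose tester accepts `vocab_size`:

from transformers import BertConfig

tiny_config = get_tiny_config(BertConfig, vocab_size=1024)
print(tiny_config.vocab_size)  # 1024, if the tester forwards it to the config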
@@ -369,8 +391,12 @@ def convert_processors(processors, tiny_config, output_folder, result):
feature_extractors.append(processor.feature_extractor)
# Check that the built processors have a unique type
assert len(set([x.__class__.__name__ for x in feature_extractors])) < 2
assert len(set([x.__class__.__name__.replace("Fast", "") for x in tokenizers])) < 2
num_types = len(set([x.__class__.__name__ for x in feature_extractors]))
if num_types >= 2:
raise ValueError(f"`feature_extractors` should contain at most 1 type, but it contains {num_types} types!")
num_types = len(set([x.__class__.__name__.replace("Fast", "") for x in tokenizers]))
if num_types >= 2:
raise ValueError(f"`tokenizers` should contain at most 1 tokenizer type, but it contains {num_types} types!")
fast_tokenizer = None
slow_tokenizer = None
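Note that the tokenizer check above deliberately drops the "Fast" suffix before counting, so a slow/fast pair of the same tokenizer counts as a single type. A self-contained sketch of the check, with class-name strings standing in for real instances:

tokenizers = ["T5Tokenizer", "T5TokenizerFast"]  # illustrative stand-ins
num_types = len({name.replace("Fast", "") for name in tokenizers})
if num_types >= 2:
    raise ValueError(f"`tokenizers` should contain at most 1 tokenizer type, but it contains {num_types} types!")
print(num_types)  # 1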
@@ -479,6 +505,56 @@ def fill_result_with_error(result, error, models_to_create):
for model_arch in models_to_create[framework]:
result[framework][model_arch.__name__] = {"model": None, "checkpoint": None, "error": error}
result["processor"] = {type(p).__name__: p.__class__.__name__ for p in result["processor"]}
def upload_model(model_dir, organization):
"""Upload the tiny models"""
arch_name = model_dir.split(os.path.sep)[-1]
repo_name = f"tiny-random-{arch_name}"
repo_exist = False
error = None
try:
create_repo(repo_id=repo_name, organization=organization, exist_ok=False, repo_type="model")
except Exception as e:
error = e
if "You already created" in str(e):
error = None
logger.warning("Remote repository exists and will be cloned.")
repo_exist = True
try:
create_repo(repo_id=repo_name, organization=organization, exist_ok=True, repo_type="model")
except Exception as e:
error = e
if error is not None:
raise ValueError(error)
with tempfile.TemporaryDirectory() as tmpdir:
repo = Repository(local_dir=tmpdir, clone_from=f"{organization}/{repo_name}")
repo.git_pull()
shutil.copytree(model_dir, tmpdir, dirs_exist_ok=True)
if repo_exist:
# Open a PR on the existing Hub repo.
hub_pr_url = upload_folder(
folder_path=model_dir,
repo_id=f"{organization}/{repo_name}",
repo_type="model",
commit_message=f"Update tiny models for {arch_name}",
commit_description=f"Upload tiny models for {arch_name}",
create_pr=True,
)
logger.warning(f"PR open in {hub_pr_url}")
else:
# Push to Hub repo directly
repo.git_add(auto_lfs_track=True)
repo.git_commit(f"Upload tiny models for {arch_name}")
repo.git_push(blocking=True) # this prints a progress bar with the upload
logger.warning(f"Tiny models {arch_name} pushed to {organization}/{repo_name}")
def build_composite_models(config_class, output_dir):
@@ -620,6 +696,87 @@ def build_composite_models(config_class, output_dir):
return result
def get_token_id_from_tokenizer(token_id_name, tokenizer, original_token_id):
"""Use `tokenizer` to get the values of `bos_token_id`, `eos_token_ids`, etc.
The argument `token_id_name` should be a string ending with `_token_id`, and `original_token_id` should be an
integer that will be return if `tokenizer` has no token corresponding to `token_id_name`.
"""
token_id = original_token_id
if not token_id_name.endswith("_token_id"):
raise ValueError(f"`token_id_name` is {token_id_name}, which doesn't end with `_token_id`!")
token = getattr(tokenizer, token_id_name.replace("_token_id", "_token"), None)
if token is not None:
if isinstance(tokenizer, PreTrainedTokenizerFast):
token_id = tokenizer._convert_token_to_id_with_added_voc(token)
else:
token_id = tokenizer._convert_token_to_id(token)
return token_id
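A hedged usage sketch for the helper above; "gpt2" is just an example checkpoint:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
# Returns the tokenizer's id for the token behind `eos_token_id`, or the
# fallback `original_token_id` if the tokenizer defines no such token.
eos_id = get_token_id_from_tokenizer("eos_token_id", tok, original_token_id=0)
print(eos_id)  # 50256 for GPT-2's <|endoftext|>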
def get_config_overrides(config_class, processors):
config_overrides = {}
# Check if there is any tokenizer (prefer fast version if any)
tokenizer = None
for processor in processors:
if isinstance(processor, PreTrainedTokenizerFast):
tokenizer = processor
break
elif isinstance(processor, PreTrainedTokenizer):
tokenizer = processor
if tokenizer is None:
return config_overrides
# Get some properties of the (already converted) tokenizer (smaller vocab size, special token ids, etc.)
vocab_size = tokenizer.vocab_size
config_overrides["vocab_size"] = vocab_size
# Used to create a new model tester with `tokenizer.vocab_size` in order to get the (updated) special token ids.
model_tester_kwargs = {"vocab_size": vocab_size}
# CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to
# `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
if config_class.__name__ in ["CLIPConfig", "GroupViTConfig", "OwlViTConfig", "XCLIPConfig", "FlavaConfig"]:
del model_tester_kwargs["vocab_size"]
model_tester_kwargs["text_kwargs"] = {"vocab_size": vocab_size}
# `FSMTModelTester` accepts `src_vocab_size` and `tgt_vocab_size` but not `vocab_size`.
elif config_class.__name__ == "FSMTConfig":
del model_tester_kwargs["vocab_size"]
model_tester_kwargs["src_vocab_size"] = tokenizer.src_vocab_size
model_tester_kwargs["tgt_vocab_size"] = tokenizer.tgt_vocab_size
_tiny_config = get_tiny_config(config_class, **model_tester_kwargs)
# handle the possibility of `text_config` inside `_tiny_config` for clip-like models (`owlvit`, `groupvit`, etc.)
if hasattr(_tiny_config, "text_config"):
_tiny_config = _tiny_config.text_config
# Collect values of some special token ids
for attr in dir(_tiny_config):
if attr.endswith("_token_id"):
token_id = getattr(_tiny_config, attr)
if token_id is not None:
# Using the token id values from `tokenizer` instead of from `_tiny_config`.
token_id = get_token_id_from_tokenizer(attr, tokenizer, original_token_id=token_id)
config_overrides[attr] = token_id
if config_class.__name__ == "FSMTConfig":
config_overrides["src_vocab_size"] = tokenizer.src_vocab_size
config_overrides["tgt_vocab_size"] = tokenizer.tgt_vocab_size
# `FSMTConfig` has `DecoderConfig` as `decoder` attribute.
config_overrides["decoder"] = configuration_fsmt.DecoderConfig(
vocab_size=tokenizer.tgt_vocab_size, bos_token_id=config_overrides["eos_token_id"]
)
return config_overrides
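The returned overrides are consumed later in `build`, where each key is set on the tiny config (or on its `text_config` for CLIP-like models). In sketch form, assuming `config_class`, `processors`, and `tiny_config` as used elsewhere in this script:

config_overrides = get_config_overrides(config_class, processors)
for k, v in config_overrides.items():
    if hasattr(tiny_config, k):
        setattr(tiny_config, k, v)  # e.g. vocab_size, eos_token_id, ...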
def build(config_class, models_to_create, output_dir):
"""Create all models for a certain model type.
@@ -658,9 +815,15 @@ def build(config_class, models_to_create, output_dir):
return result
for processor_class in processor_classes:
processor = build_processor(config_class, processor_class)
if processor is not None:
result["processor"][processor_class] = processor
try:
processor = build_processor(config_class, processor_class)
if processor is not None:
result["processor"][processor_class] = processor
except Exception as e:
error = f"Failed to build processor for {processor_class.__name__}: {e}"
fill_result_with_error(result, error, models_to_create)
logger.error(result["error"])
return result
if len(result["processor"]) == 0:
error = f"No processor could be built for {config_class.__name__}."
@@ -671,9 +834,7 @@
try:
tiny_config = get_tiny_config(config_class)
except Exception as e:
# Let's still return the processors, so we know they could be built.
result["processor"] = {type(p).__name__: p.__class__.__name__ for p in result["processor"]}
error = str(e)
error = f"Failed to get tiny config for {config_class.__name__}: {e}"
fill_result_with_error(result, error, models_to_create)
logger.error(result["error"])
return result
@@ -681,9 +842,11 @@
# Convert the processors (reduce vocabulary size, smaller image size, etc.)
processors = list(result["processor"].values())
processor_output_folder = os.path.join(output_dir, "processors")
processors = convert_processors(processors, tiny_config, processor_output_folder, result)
# update `result["processor"]`
result["processor"] = {type(p).__name__: p.__class__.__name__ for p in processors}
try:
processors = convert_processors(processors, tiny_config, processor_output_folder, result)
except Exception as e:
error = f"Failed to convert the processors: {e}"
result["warnings"].append(error)
if len(result["processor"]) == 0:
error = f"No processor could be converted for {config_class.__name__}."
@@ -691,17 +854,23 @@
logger.error(result["error"])
return result
# Update the config with the properties of the converted processors (smaller vocab size, image size, etc.)
for processor in processors:
if isinstance(processor, PreTrainedTokenizerBase):
vocab_size = processor.vocab_size
result["vocab_size"] = vocab_size
config_overrides = {k: v for k, v in result.items() if k in ["vocab_size"] and v is not None}
# Update `vocab_size`
try:
config_overrides = get_config_overrides(config_class, processors)
except Exception as e:
error = f"Failure occurs while calling `get_config_overrides`: {e}"
fill_result_with_error(result, error, models_to_create)
logger.error(result["error"])
return result
# Just for us to see this easily in the report
if "vocab_size" in config_overrides:
result["vocab_size"] = config_overrides["vocab_size"]
# Update attributes that `vocab_size` involves
for k, v in config_overrides.items():
if hasattr(tiny_config, k):
setattr(tiny_config, k, v)
# currently only `vocab_size` is involved, so let's just look at `text_config`
# So far, we only have to deal with `text_config`, as `config_overrides` contains text-related attributes only.
elif (
hasattr(tiny_config, "text_config")
and tiny_config.text_config is not None
@@ -716,6 +885,9 @@
if result["warnings"]:
logger.warning(result["warnings"])
# update `result["processor"]`
result["processor"] = {type(p).__name__: p.__class__.__name__ for p in processors}
for pytorch_arch in models_to_create["pytorch"]:
result["pytorch"][pytorch_arch.__name__] = {}
error = None
@@ -729,7 +901,7 @@
result["pytorch"][pytorch_arch.__name__]["checkpoint"] = (
get_checkpoint_dir(output_dir, pytorch_arch) if model is not None else None
)
if error:
if error is not None:
result["pytorch"][pytorch_arch.__name__]["error"] = error
logger.error(f"{pytorch_arch.__name__}: {error}")
@@ -763,7 +935,7 @@
result["tensorflow"][tensorflow_arch.__name__]["checkpoint"] = (
get_checkpoint_dir(output_dir, tensorflow_arch) if model is not None else None
)
if error:
if error is not None:
result["tensorflow"][tensorflow_arch.__name__]["error"] = error
logger.error(f"{tensorflow_arch.__name__}: {error}")
@@ -867,6 +1039,13 @@
type=list_str,
help="Comma-separated list of model type(s) from which the tiny models will be created.",
)
parser.add_argument("--upload", action="store_true", help="If to upload the created tiny models to the Hub.")
parser.add_argument(
"--organization",
default=None,
type=str,
help="The organization on the Hub to which the tiny models will be uploaded.",
)
parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.")
args = parser.parse_args()
@@ -913,3 +1092,28 @@
with open("simple_failed_report.txt", "w") as fp:
fp.write(failed_report)
if args.upload:
if args.organization is None:
raise ValueError("The argument `organization` could not be `None`. No model is uploaded")
to_upload = []
for model_type in os.listdir(args.output_path):
for arch in os.listdir(os.path.join(args.output_path, model_type)):
if arch == "processors":
continue
to_upload.append(os.path.join(args.output_path, model_type, arch))
to_upload = sorted(to_upload)
upload_results = {}
if len(to_upload) > 0:
for model_dir in to_upload:
try:
upload_model(model_dir, args.organization)
except Exception as e:
error = f"Failed to upload {model_dir}: {e}"
logger.error(error)
upload_results[model_dir] = error
with open("failed_uploads.json", "w") as fp:
json.dump(upload_results, fp, indent=4)
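Upload failures are collected per model directory rather than aborting the whole run, so a quick triage pass over the report could look like this (a hedged sketch; `failed_uploads.json` only exists after a run with `--upload`):

import json

with open("failed_uploads.json") as fp:
    failures = json.load(fp)  # {model_dir: error message}
for model_dir, error in sorted(failures.items()):
    print(f"{model_dir}: {error}")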