Unverified commit 826f0457 authored by Lysandre Debut, committed by GitHub

Model templates encoder only (#8509)



* Model templates

* TensorFlow

* Remove pooler

* CI

* Tokenizer + Refactoring

* Encoder-Decoder

* Let's go testing

* Encoder-Decoder in TF

* Let's go testing in TF

* Documentation

* README

* Fixes

* Better names

* Style

* Update docs

* Choose to skip either TF or PT

* Code quality fixes

* Add to testing suite

* Update file path

* Cookiecutter path

* Update `transformers` path

* Handle rebasing

* Remove seq2seq from model templates

* Remove s2s config

* Apply Sylvain and Patrick comments

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Last fixes from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 42e2d02e
@@ -4,10 +4,12 @@ on:
  push:
    branches:
      - master
      - model-templates
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
  # pull_request:
  repository_dispatch:
@@ -55,6 +57,14 @@ jobs:
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
      - name: Create model files
        run: |
          source .env/bin/activate
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/encoder-bert-tokenizer.json --path=templates/cookiecutter
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/pt-encoder-bert-tokenizer.json --path=templates/cookiecutter
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/standalone.json --path=templates/cookiecutter
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/tf-encoder-bert-tokenizer.json --path=templates/cookiecutter
      - name: Run all non-slow tests on GPU
        env:
          OMP_NUM_THREADS: 1
@@ -116,6 +126,14 @@ jobs:
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
      - name: Create model files
        run: |
          source .env/bin/activate
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/encoder-bert-tokenizer.json --path=templates/cookiecutter
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/pt-encoder-bert-tokenizer.json --path=templates/cookiecutter
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/standalone.json --path=templates/cookiecutter
          transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/tf-encoder-bert-tokenizer.json --path=templates/cookiecutter
      - name: Run all non-slow tests on GPU
        env:
          OMP_NUM_THREADS: 1
...
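For reference, each `--testing_file` passed above is a plain JSON cookiecutter context whose keys mirror the `cookiecutter.json` shown later in this diff. A hypothetical context (values are illustrative, not the actual test fixtures shipped in `templates/cookiecutter/tests/`) might look like:

```json
{
    "modelname": "TemplateBI",
    "uppercase_modelname": "TEMPLATE_BI",
    "lowercase_modelname": "template_bi",
    "camelcase_modelname": "TemplateBi",
    "authors": "The HuggingFace Team",
    "checkpoint_identifier": "huggingface/template-bi-base-cased",
    "tokenizer_type": "Based on BERT",
    "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow"
}
```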
@@ -98,12 +98,13 @@ else:
extras["tokenizers"] = ["tokenizers==0.9.2"]
extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"]

extras["modelcreation"] = ["cookiecutter==1.7.2"]

extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
extras["sentencepiece"] = ["sentencepiece==0.1.91"]
extras["retrieval"] = ["faiss-cpu", "datasets"]
extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] + extras["modelcreation"]

# sphinx-rtd-theme==0.5.0 introduced big changes in the style.
extras["docs"] = ["recommonmark", "sphinx==3.2.1", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
extras["quality"] = ["black >= 20.8b1", "isort >= 5.5.4", "flake8 >= 3.8.3"]
@@ -111,7 +112,7 @@ extras["quality"] = ["black >= 20.8b1", "isort >= 5.5.4", "flake8 >= 3.8.3"]
extras["all"] = extras["tf"] + extras["torch"] + extras["flax"] + extras["sentencepiece"] + extras["tokenizers"]
extras["dev"] = extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["docs"] + extras["sklearn"] + extras["modelcreation"]

setup(
...
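With this change, the `cookiecutter` dependency can be installed on its own via the new extra (e.g. `pip install -e ".[modelcreation]"`), and it is pulled in automatically by the `testing` and `dev` extras, which is what the CI jobs above rely on.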
import json
import os
import shutil
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import List

from cookiecutter.main import cookiecutter
from transformers.commands import BaseTransformersCLICommand

from ..utils import logging


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def add_new_model_command_factory(args: Namespace):
    return AddNewModelCommand(args.testing, args.testing_file, path=args.path)


class AddNewModelCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        add_new_model_parser = parser.add_parser("add-new-model")
        add_new_model_parser.add_argument("--testing", action="store_true", help="If in testing mode.")
        add_new_model_parser.add_argument("--testing_file", type=str, help="Configuration file on which to run.")
        add_new_model_parser.add_argument(
            "--path", type=str, help="Path to cookiecutter. Should only be used for testing purposes."
        )
        add_new_model_parser.set_defaults(func=add_new_model_command_factory)

    def __init__(self, testing: bool, testing_file: str, path=None, *args):
        self._testing = testing
        self._testing_file = testing_file
        self._path = path

    def run(self):
        # Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory
        directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]]
        if len(directories) > 0:
            raise ValueError(
                "Several directories starting with `cookiecutter-template-` in current working directory. "
                "Please clean your directory by removing all folders starting with `cookiecutter-template-` or "
                "change your working directory."
            )

        path_to_transformer_root = (
            Path(__file__).parent.parent.parent.parent if self._path is None else Path(self._path).parent.parent
        )
        path_to_cookiecutter = path_to_transformer_root / "templates" / "cookiecutter"

        # Execute cookiecutter
        if not self._testing:
            cookiecutter(str(path_to_cookiecutter))
        else:
            with open(self._testing_file, "r") as configuration_file:
                testing_configuration = json.load(configuration_file)

            cookiecutter(
                str(path_to_cookiecutter if self._path is None else self._path),
                no_input=True,
                extra_context=testing_configuration,
            )

        directory = [directory for directory in os.listdir() if "cookiecutter-template-" in directory[:22]][0]

        # Retrieve configuration
        with open(directory + "/configuration.json", "r") as configuration_file:
            configuration = json.load(configuration_file)

        lowercase_model_name = configuration["lowercase_modelname"]
        pytorch_or_tensorflow = configuration["generate_tensorflow_and_pytorch"]
        os.remove(f"{directory}/configuration.json")

        output_pytorch = "PyTorch" in pytorch_or_tensorflow
        output_tensorflow = "TensorFlow" in pytorch_or_tensorflow

        shutil.move(
            f"{directory}/configuration_{lowercase_model_name}.py",
            f"{path_to_transformer_root}/src/transformers/configuration_{lowercase_model_name}.py",
        )

        def remove_copy_lines(path):
            with open(path, "r") as f:
                lines = f.readlines()
            with open(path, "w") as f:
                for line in lines:
                    if "# Copied from transformers." not in line:
                        f.write(line)

        if output_pytorch:
            if not self._testing:
                remove_copy_lines(f"{directory}/modeling_{lowercase_model_name}.py")

            shutil.move(
                f"{directory}/modeling_{lowercase_model_name}.py",
                f"{path_to_transformer_root}/src/transformers/modeling_{lowercase_model_name}.py",
            )

            shutil.move(
                f"{directory}/test_modeling_{lowercase_model_name}.py",
                f"{path_to_transformer_root}/tests/test_modeling_{lowercase_model_name}.py",
            )
        else:
            os.remove(f"{directory}/modeling_{lowercase_model_name}.py")
            os.remove(f"{directory}/test_modeling_{lowercase_model_name}.py")

        if output_tensorflow:
            if not self._testing:
                remove_copy_lines(f"{directory}/modeling_tf_{lowercase_model_name}.py")

            shutil.move(
                f"{directory}/modeling_tf_{lowercase_model_name}.py",
                f"{path_to_transformer_root}/src/transformers/modeling_tf_{lowercase_model_name}.py",
            )

            shutil.move(
                f"{directory}/test_modeling_tf_{lowercase_model_name}.py",
                f"{path_to_transformer_root}/tests/test_modeling_tf_{lowercase_model_name}.py",
            )
        else:
            os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py")
            os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py")

        shutil.move(
            f"{directory}/{lowercase_model_name}.rst",
            f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst",
        )

        shutil.move(
            f"{directory}/tokenization_{lowercase_model_name}.py",
            f"{path_to_transformer_root}/src/transformers/tokenization_{lowercase_model_name}.py",
        )

        from os import fdopen, remove
        from shutil import copymode, move
        from tempfile import mkstemp

        def replace(original_file: str, line_to_copy_below: str, lines_to_copy: List[str]):
            # Create temp file
            fh, abs_path = mkstemp()
            line_found = False
            with fdopen(fh, "w") as new_file:
                with open(original_file) as old_file:
                    for line in old_file:
                        new_file.write(line)
                        if line_to_copy_below in line:
                            line_found = True
                            for line_to_copy in lines_to_copy:
                                new_file.write(line_to_copy)

            if not line_found:
                raise ValueError(f"Line {line_to_copy_below} was not found in file.")

            # Copy the file permissions from the old file to the new file
            copymode(original_file, abs_path)
            # Remove original file
            remove(original_file)
            # Move new file
            move(abs_path, original_file)

        def skip_units(line):
            return ("generating PyTorch" in line and not output_pytorch) or (
                "generating TensorFlow" in line and not output_tensorflow
            )

        def replace_in_files(path_to_datafile):
            with open(path_to_datafile) as datafile:
                lines_to_copy = []
                skip_file = False
                skip_snippet = False
                for line in datafile:
                    if "# To replace in: " in line and "##" not in line:
                        file_to_replace_in = line.split('"')[1]
                        skip_file = skip_units(line)
                    elif "# Below: " in line and "##" not in line:
                        line_to_copy_below = line.split('"')[1]
                        skip_snippet = skip_units(line)
                    elif "# End." in line and "##" not in line:
                        if not skip_file and not skip_snippet:
                            replace(file_to_replace_in, line_to_copy_below, lines_to_copy)

                        lines_to_copy = []
                    elif "# Replace with" in line and "##" not in line:
                        lines_to_copy = []
                    elif "##" not in line:
                        lines_to_copy.append(line)

            remove(path_to_datafile)

        replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py")
        os.rmdir(directory)
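To make the data-file format parsed by `replace_in_files` concrete: each snippet is delimited by `# To replace in:`, `# Below:`, and `# End.` markers, lines containing `##` are ignored, and marker lines that mention "generating PyTorch" or "generating TensorFlow" gate framework-specific snippets. A hypothetical, minimal `to_replace_<model_name>.py` (the target file and copied line here are illustrative; only the marker syntax comes from the parser above) could look like:

```python
## Lines containing "##" are ignored by the parser.
# To replace in: "src/transformers/__init__.py"
# Below: "from .configuration_bert import"
from .configuration_brand_new_bert import BrandNewBertConfig
# End.
```

On `# End.`, the collected snippet lines are written immediately below the first line of the target file that contains the `# Below:` anchor string.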
#!/usr/bin/env python
from argparse import ArgumentParser

from transformers.commands.add_new_model import AddNewModelCommand
from transformers.commands.convert import ConvertCommand
from transformers.commands.download import DownloadCommand
from transformers.commands.env import EnvironmentCommand
@@ -20,6 +21,7 @@ def main():
    RunCommand.register_subcommand(commands_parser)
    ServeCommand.register_subcommand(commands_parser)
    UserCommands.register_subcommand(commands_parser)
    AddNewModelCommand.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()
...
@@ -59,6 +59,7 @@ from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
    (key, value)
    for pretrained_map in [
        # Add archive maps here
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BART_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -95,6 +96,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
CONFIG_MAPPING = OrderedDict(
    [
        # Add configs here
        ("retribert", RetriBertConfig),
        ("t5", T5Config),
        ("mobilebert", MobileBertConfig),
@@ -136,6 +138,7 @@ CONFIG_MAPPING = OrderedDict(
MODEL_NAMES_MAPPING = OrderedDict(
    [
        # Add full (and cased) model names here
        ("retribert", "RetriBERT"),
        ("t5", "T5"),
        ("mobilebert", "MobileBERT"),
...
@@ -226,11 +226,14 @@ from .modeling_xlnet import (
from .utils import logging

# Add modeling imports here

logger = logging.get_logger(__name__)

MODEL_MAPPING = OrderedDict(
    [
        # Base model mapping
        (RetriBertConfig, RetriBertModel),
        (T5Config, T5Model),
        (DistilBertConfig, DistilBertModel),
@@ -266,6 +269,7 @@ MODEL_MAPPING = OrderedDict(
MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
    [
        # Model for pre-training mapping
        (LayoutLMConfig, LayoutLMForMaskedLM),
        (RetriBertConfig, RetriBertModel),
        (T5Config, T5ForConditionalGeneration),
@@ -295,6 +299,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
    [
        # Model with LM heads mapping
        (LayoutLMConfig, LayoutLMForMaskedLM),
        (T5Config, T5ForConditionalGeneration),
        (DistilBertConfig, DistilBertForMaskedLM),
@@ -325,6 +330,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
    [
        # Model for Causal LM mapping
        (CamembertConfig, CamembertForCausalLM),
        (XLMRobertaConfig, XLMRobertaForCausalLM),
        (RobertaConfig, RobertaForCausalLM),
@@ -347,6 +353,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
    [
        # Model for Masked LM mapping
        (LayoutLMConfig, LayoutLMForMaskedLM),
        (DistilBertConfig, DistilBertForMaskedLM),
        (AlbertConfig, AlbertForMaskedLM),
@@ -368,6 +375,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
    [
        # Model for Seq2Seq Causal LM mapping
        (T5Config, T5ForConditionalGeneration),
        (PegasusConfig, PegasusForConditionalGeneration),
        (MarianConfig, MarianMTModel),
@@ -383,6 +391,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
    [
        # Model for Sequence Classification mapping
        (DistilBertConfig, DistilBertForSequenceClassification),
        (AlbertConfig, AlbertForSequenceClassification),
        (CamembertConfig, CamembertForSequenceClassification),
@@ -407,6 +416,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
    [
        # Model for Question Answering mapping
        (DistilBertConfig, DistilBertForQuestionAnswering),
        (AlbertConfig, AlbertForQuestionAnswering),
        (CamembertConfig, CamembertForQuestionAnswering),
@@ -429,6 +439,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
    [
        # Model for Token Classification mapping
        (LayoutLMConfig, LayoutLMForTokenClassification),
        (DistilBertConfig, DistilBertForTokenClassification),
        (CamembertConfig, CamembertForTokenClassification),
@@ -450,6 +461,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
    [
        # Model for Multiple Choice mapping
        (CamembertConfig, CamembertForMultipleChoice),
        (ElectraConfig, ElectraForMultipleChoice),
        (XLMRobertaConfig, XLMRobertaForMultipleChoice),
...
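The `# Add ... here` comment markers give the template's `to_replace_` data file stable anchor lines to insert below (see `replace_in_files` earlier in this diff). Once a generated model has been spliced into these mappings, it resolves through the auto classes like any other. A quick sanity check using an existing model, since the generated names depend on your cookiecutter answers:

```python
from transformers import AutoConfig, AutoModel

# CONFIG_MAPPING resolves the "bert" model type to BertConfig; MODEL_MAPPING
# then resolves BertConfig to BertModel when building from a config alone.
config = AutoConfig.for_model("bert")
model = AutoModel.from_config(config)
print(type(model).__name__)  # BertModel
```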
@@ -169,11 +169,14 @@ from .modeling_tf_xlnet import (
from .utils import logging

# Add modeling imports here

logger = logging.get_logger(__name__)

TF_MODEL_MAPPING = OrderedDict(
    [
        # Base model mapping
        (LxmertConfig, TFLxmertModel),
        (T5Config, TFT5Model),
        (DistilBertConfig, TFDistilBertModel),
@@ -200,6 +203,7 @@ TF_MODEL_MAPPING = OrderedDict(
TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
    [
        # Model for pre-training mapping
        (LxmertConfig, TFLxmertForPreTraining),
        (T5Config, TFT5ForConditionalGeneration),
        (DistilBertConfig, TFDistilBertForMaskedLM),
@@ -224,6 +228,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
    [
        # Model with LM heads mapping
        (T5Config, TFT5ForConditionalGeneration),
        (DistilBertConfig, TFDistilBertForMaskedLM),
        (AlbertConfig, TFAlbertForMaskedLM),
@@ -249,6 +254,7 @@ TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
    [
        # Model for Causal LM mapping
        (BertConfig, TFBertLMHeadModel),
        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
        (GPT2Config, TFGPT2LMHeadModel),
@@ -264,6 +270,7 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
    [
        # Model for Masked LM mapping
        (DistilBertConfig, TFDistilBertForMaskedLM),
        (AlbertConfig, TFAlbertForMaskedLM),
        (CamembertConfig, TFCamembertForMaskedLM),
@@ -282,6 +289,7 @@ TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
    [
        # Model for Seq2Seq Causal LM mapping
        (T5Config, TFT5ForConditionalGeneration),
        (MarianConfig, TFMarianMTModel),
        (MBartConfig, TFMBartForConditionalGeneration),
@@ -293,6 +301,7 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
    [
        # Model for Sequence Classification mapping
        (DistilBertConfig, TFDistilBertForSequenceClassification),
        (AlbertConfig, TFAlbertForSequenceClassification),
        (CamembertConfig, TFCamembertForSequenceClassification),
@@ -310,6 +319,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
    [
        # Model for Question Answering mapping
        (DistilBertConfig, TFDistilBertForQuestionAnswering),
        (AlbertConfig, TFAlbertForQuestionAnswering),
        (CamembertConfig, TFCamembertForQuestionAnswering),
@@ -328,6 +338,7 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
    [
        # Model for Token Classification mapping
        (DistilBertConfig, TFDistilBertForTokenClassification),
        (AlbertConfig, TFAlbertForTokenClassification),
        (CamembertConfig, TFCamembertForTokenClassification),
@@ -345,6 +356,7 @@ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
    [
        # Model for Multiple Choice mapping
        (CamembertConfig, TFCamembertForMultipleChoice),
        (XLMConfig, TFXLMForMultipleChoice),
        (XLMRobertaConfig, TFXLMRobertaForMultipleChoice),
...
# Using `cookiecutter` to generate models

This folder contains templates to generate new models that fit the current API and pass all tests. It generates
models in both PyTorch and TensorFlow, completes the `__init__.py` and auto-modeling files, and creates the
documentation.

## Usage

Using the `cookiecutter` utility requires all of the `dev` dependencies to be installed. Let's first clone the
repository and install it in our environment:

```shell script
git clone https://github.com/huggingface/transformers
cd transformers
pip install -e ".[dev]"
```

Once the installation is done, you can use the CLI command `add-new-model` to generate your models:

```shell script
transformers-cli add-new-model
```

This should launch the `cookiecutter` package, which will prompt you to fill in the configuration.

The `modelname` should be cased according to the plain text casing, i.e., BERT, RoBERTa, DeBERTa.
```
modelname [<ModelNAME>]:
uppercase_modelname [<MODEL_NAME>]:
lowercase_modelname [<model_name>]:
camelcase_modelname [<ModelName>]:
```

Fill in the `authors` with your team members:
```
authors [The HuggingFace Team]:
```

The checkpoint identifier is the checkpoint that will be used in the examples across the files. Put the name you wish,
as it will appear on the model hub. Do not forget to include the organisation.
```
checkpoint_identifier [organisation/<model_name>-base-cased]:
```

The tokenizer should either be based on BERT if it behaves exactly like the BERT tokenizer, or a standalone otherwise.
```
Select tokenizer_type:
1 - Based on BERT
2 - Standalone
Choose from 1, 2 [1]:
```
<!---
Choose if your model is an encoder-decoder, or an encoder-only architecture.

If your model is an encoder-only architecture, the generated architecture will be based on the BERT model.
If your model is an encoder-decoder architecture, the generated architecture will be based on the BART model. You can,
of course, edit the files once the generation is complete.
```
Select is_encoder_decoder_model:
1 - True
2 - False
Choose from 1, 2 [1]:
```
-->

Once the command has finished, you should have a total of 7 new files spread across the repository:
```
docs/source/model_doc/<model_name>.rst
src/transformers/configuration_<model_name>.py
src/transformers/modeling_<model_name>.py
src/transformers/modeling_tf_<model_name>.py
src/transformers/tokenization_<model_name>.py
tests/test_modeling_<model_name>.py
tests/test_modeling_tf_<model_name>.py
```

You can run the tests to ensure that they all pass:
```
python -m pytest ./tests/test_*<model_name>*.py
```

Feel free to modify each file to mimic the behavior of your model. Run `make fix-copies` to update
`docs/source/index.rst` with your changes.

⚠ You should be careful about the classes preceded by the following line:

```python
# Copied from transformers.[...]
```

This line ensures that the copy does not diverge from the source. If it *should* diverge, because the implementation
is different, this line needs to be deleted. If you don't delete this line and run `make fix-copies`,
your changes will be overwritten.

Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change
is needed!) afterwards to make sure everything works as expected.

Once the files are generated and you are happy with your changes, here's a checklist to ensure that your contribution
will be merged quickly:

- You should run the `make fixup` utility to fix the style of the files and to ensure the code quality meets the
  library's standards.
- You should complete the documentation file (`docs/source/model_doc/<model_name>.rst`) so that your model may be
  usable.
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert XXX checkpoint."""


import argparse
import logging

import torch

from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx


logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = XxxConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = XxxForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_xxx(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
    )
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained model. \n"
        "This specifies the model architecture.",
    )
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)
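As a usage sketch for this conversion template (the script name and all paths below are placeholders, not files from this commit):

```python
# CLI equivalent (hypothetical):
#   python convert_xxx_checkpoint.py \
#       --tf_checkpoint_path ./tf_model/model.ckpt \
#       --config_file ./tf_model/config.json \
#       --pytorch_dump_path ./pt_model/pytorch_model.bin
convert_tf_checkpoint_to_pytorch(
    "./tf_model/model.ckpt", "./tf_model/config.json", "./pt_model/pytorch_model.bin"
)
```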
{
    "modelname": "{{cookiecutter.modelname}}",
    "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}",
    "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}",
    "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}",
    "authors": "{{cookiecutter.authors}}",
    "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}",
    "tokenizer_type": "{{cookiecutter.tokenizer_type}}",
    "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}"
}
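This context file is what both the interactive prompts and the `--testing_file` JSONs feed. Programmatically, the same generation can be driven through the `cookiecutter` API, exactly as `AddNewModelCommand.run` does in `--testing` mode (the context values here are illustrative):

```python
from cookiecutter.main import cookiecutter

# Mirrors AddNewModelCommand.run with --testing: render the template
# without prompting, using an explicit context instead.
cookiecutter(
    "templates/cookiecutter",
    no_input=True,
    extra_context={
        "modelname": "BrandNewBERT",
        "uppercase_modelname": "BRAND_NEW_BERT",
        "lowercase_modelname": "brand_new_bert",
        "camelcase_modelname": "BrandNewBert",
        "authors": "The HuggingFace Team",
        "checkpoint_identifier": "organisation/brand-new-bert-base-cased",
        "tokenizer_type": "Based on BERT",
        "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow",
    },
)
```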
# coding=utf-8
# Copyright {{cookiecutter.authors}} and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,29 +12,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" {{cookiecutter.modelname}} model configuration """

from .configuration_utils import PretrainedConfig
from .utils import logging


logger = logging.get_logger(__name__)

{{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/config.json",
    # See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}}
}


class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model`.
    It is used to instantiate an {{cookiecutter.modelname}} model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the {{cookiecutter.modelname}} `{{cookiecutter.checkpoint_identifier}} <https://huggingface.co/{{cookiecutter.checkpoint_identifier}}>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
@@ -43,66 +40,90 @@
    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
            :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
            :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
            :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.

    Example::

        >>> from transformers import {{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}Config

        >>> # Initializing a {{cookiecutter.modelname}} {{cookiecutter.checkpoint_identifier}} style configuration
        >>> configuration = {{cookiecutter.camelcase_modelname}}Config()

        >>> # Initializing a model from the {{cookiecutter.checkpoint_identifier}} style configuration
        >>> model = {{cookiecutter.camelcase_modelname}}Model(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "{{cookiecutter.lowercase_modelname}}"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        is_encoder_decoder=False,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            is_encoder_decoder=is_encoder_decoder,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs
        )

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
# coding=utf-8 # coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright 2018 {{cookiecutter.authors}} and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -13,15 +12,13 @@ ...@@ -13,15 +12,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" TF 2.0 XXX model. """ """ TF 2.0 {{cookiecutter.modelname}} model. """
####################################################
# In this template, replace all the XXX (various casings) with your model name
####################################################
import tensorflow as tf import tensorflow as tf
from .configuration_xxx import XxxConfig from .activations_tf import get_tf_activation
from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings, add_code_sample_docstrings,
...@@ -29,6 +26,7 @@ from .file_utils import ( ...@@ -29,6 +26,7 @@ from .file_utils import (
add_start_docstrings_to_model_forward, add_start_docstrings_to_model_forward,
) )
from .modeling_tf_outputs import ( from .modeling_tf_outputs import (
TFBaseModelOutput,
TFBaseModelOutputWithPooling, TFBaseModelOutputWithPooling,
TFMaskedLMOutput, TFMaskedLMOutput,
TFMultipleChoiceModelOutput, TFMultipleChoiceModelOutput,
...@@ -43,6 +41,7 @@ from .modeling_tf_utils import ( ...@@ -43,6 +41,7 @@ from .modeling_tf_utils import (
TFQuestionAnsweringLoss, TFQuestionAnsweringLoss,
TFSequenceClassificationLoss, TFSequenceClassificationLoss,
TFTokenClassificationLoss, TFTokenClassificationLoss,
TFSequenceSummary,
get_initializer, get_initializer,
keras_serializable, keras_serializable,
shape_list, shape_list,
...@@ -53,72 +52,437 @@ from .utils import logging ...@@ -53,72 +52,437 @@ from .utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "XXXConfig" _CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
_TOKENIZER_FOR_DOC = "XxxTokenizer" _TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer"
#################################################### TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [
# This list contrains shortcut names for some of "{{cookiecutter.checkpoint_identifier}}",
# the pretrained weights provided with the models # See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}}
####################################################
TF_XXX_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xxx-base-uncased",
"xxx-large-uncased",
] ]
#################################################### # Copied from transformers.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
# TF 2.0 Models are constructed using Keras imperative API by sub-classing class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer):
# - tf.keras.layers.Layer for the layers and """Construct the embeddings from word, position and token_type embeddings."""
# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model)
####################################################
#################################################### def __init__(self, config, **kwargs):
# Here is an example of typical layer in a TF 2.0 model of the library super().__init__(**kwargs)
# The classes are usually identical to the PyTorch ones and prefixed with 'TF'.
# self.vocab_size = config.vocab_size
# Note that class __init__ parameters includes **kwargs (send to 'super'). self.hidden_size = config.hidden_size
# This let us have a control on class scope and variable names: self.initializer_range = config.initializer_range
# More precisely, we set the names of the class attributes (lower level layers) to self.position_embeddings = tf.keras.layers.Embedding(
# to the equivalent attributes names in the PyTorch model so we can have equivalent config.max_position_embeddings,
# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other. config.hidden_size,
# embeddings_initializer=get_initializer(self.initializer_range),
# See the conversion methods in modeling_tf_pytorch_utils.py for more details name="position_embeddings",
#################################################### )
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size,
config.hidden_size,
embeddings_initializer=get_initializer(self.initializer_range),
name="token_type_embeddings",
)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def call(
self,
input_ids=None,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
):
"""Get token embeddings of inputs.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns:
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
shape [batch_size, length, embedding_size]; (2) mode == "linear", output
linear tensor, float32 with shape [batch_size, length, vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
elif mode == "linear":
return self._linear(input_ids)
else:
raise ValueError("mode {} is not valid.".format(mode))
TFXxxAttention = tf.keras.layers.Layer def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False):
"""Applies embedding based on inputs tensor."""
assert not (input_ids is None and inputs_embeds is None)
TFXxxIntermediate = tf.keras.layers.Layer if input_ids is not None:
input_shape = shape_list(input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
TFXxxOutput = tf.keras.layers.Layer seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
class TFXxxLayer(tf.keras.layers.Layer): if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype)
token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns:
float32 tensor with shape [batch_size, length, vocab_size].
"""
batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size])
# Copied from transformers.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.attention = TFXxxAttention(config, name="attention")
self.intermediate = TFXxxIntermediate(config, name="intermediate")
self.transformer_output = TFXxxOutput(config, name="output")
def call(self, inputs, training=False): if config.hidden_size % config.num_attention_heads != 0:
hidden_states, attention_mask, head_mask = inputs raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
assert config.hidden_size % config.num_attention_heads == 0
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
)
self.key = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
)
self.value = tf.keras.layers.Dense(
self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
)
self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) def transpose_for_scores(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
batch_size = shape_list(hidden_states)[0]
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = tf.matmul(
query_layer, key_layer, transpose_b=True
) # (batch size, num_heads, seq_len_q, seq_len_k)
dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores
attention_scores = attention_scores / tf.math.sqrt(dk)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = tf.nn.softmax(attention_scores, axis=-1)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs, training=training)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = tf.matmul(attention_probs, value_layer)
context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
context_layer = tf.reshape(
context_layer, (batch_size, -1, self.all_head_size)
) # (batch_size, seq_len_q, all_head_size)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
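# A hedged shape walk-through of the call above (toy values, not template code):
# with batch=8, seq_len=5, num_heads=2, head_size=2,
#
#     query/key/value after transpose_for_scores: [8, 2, 5, 2]
#     attention_scores = query @ key^T / sqrt(2)   -> [8, 2, 5, 5]
#     attention_probs  = softmax(scores, axis=-1)  -> [8, 2, 5, 5]
#     context_layer    = probs @ value             -> [8, 2, 5, 2], reshaped to [8, 5, 4]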
# Copied from transformers.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.self_attention = TF{{cookiecutter.camelcase_modelname}}SelfAttention(config, name="self")
self.dense_output = TF{{cookiecutter.camelcase_modelname}}SelfOutput(config, name="output")
def prune_heads(self, heads):
raise NotImplementedError
def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False):
self_outputs = self.self_attention(
input_tensor, attention_mask, head_mask, output_attentions, training=training
)
attention_output = self.dense_output(self_outputs[0], input_tensor, training=training)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
# Copied from transformers.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = get_tf_activation(config.hidden_act)
else:
self.intermediate_act_fn = config.hidden_act
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# Copied from transformers.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def call(self, hidden_states, input_tensor, training=False):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.modeling_tf_bert.TFBertLayer with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.attention = TF{{cookiecutter.camelcase_modelname}}Attention(config, name="attention")
self.intermediate = TF{{cookiecutter.camelcase_modelname}}Intermediate(config, name="intermediate")
self.{{cookiecutter.lowercase_modelname}}_output = TF{{cookiecutter.camelcase_modelname}}Output(config, name="output")
def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False):
attention_outputs = self.attention(
hidden_states, attention_mask, head_mask, output_attentions, training=training
)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output)
layer_output = self.{{cookiecutter.lowercase_modelname}}_output(intermediate_output, attention_output, training=training)
outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
return outputs
class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.layer = [TF{{cookiecutter.camelcase_modelname}}Layer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(
self,
hidden_states,
attention_mask,
head_mask,
output_attentions,
output_hidden_states,
return_dict,
training=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module(
hidden_states, attention_mask, head_mask[i], output_attentions, training=training
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
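# Sketch of the two return conventions above (assumed toy setting, not normative):
# with output_hidden_states=True and num_hidden_layers=2, `all_hidden_states`
# collects the embedding output plus one entry per layer:
#
#     return_dict=False -> (last_hidden_state, (h_emb, h_1, h_2))
#     return_dict=True  -> TFBaseModelOutput(last_hidden_state=..., hidden_states=(h_emb, h_1, h_2))
#
# None entries (e.g. attentions when output_attentions=False) are dropped from the
# tuple form but kept as None attributes on the dataclass.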
# Copied from transformers.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str):
self.transform_act_fn = get_tf_activation(config.hidden_act)
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
def call(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
# Copied from transformers.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform")
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def call(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
# Copied from transformers.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions")
def call(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class TF{{cookiecutter.camelcase_modelname}}NSPHead(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.seq_relationship = tf.keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
)
def call(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
@keras_serializable
class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer):
config_class = {{cookiecutter.camelcase_modelname}}Config
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers
self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.return_dict = config.use_return_dict
self.embeddings = TF{{cookiecutter.camelcase_modelname}}Embeddings(config, name="embeddings")
self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder")
self.config = config
def get_input_embeddings(self):
return self.embeddings
@@ -127,7 +491,11 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
self.embeddings.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
def call(
self,
@@ -182,9 +550,12 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
if attention_mask is None:
attention_mask = tf.fill(input_shape, 1)
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
@@ -197,8 +568,7 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
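# Worked example of the additive mask above (illustrative numbers): a padding mask
# [1, 1, 0] turns into [0.0, 0.0, -10000.0], so padded positions receive a large
# negative score and near-zero softmax weight:
#
#     >>> mask = tf.constant([[1.0, 1.0, 0.0]])
#     >>> (1.0 - mask) * -10000.0
#     <tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[    -0.,     -0., -10000.]], dtype=float32)>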
# Prepare head mask if needed
@@ -212,7 +582,6 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
encoder_outputs = self.encoder(
embedding_output,
extended_attention_mask,
@@ -224,43 +593,31 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
)
sequence_output = encoder_outputs[0]
if not return_dict:
return (
sequence_output,
) + encoder_outputs[1:]
return TFBaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
# Copied from transformers.modeling_tf_bert.TFBertPreTrainedModel with Bert->{{cookiecutter.camelcase_modelname}}
class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = {{cookiecutter.camelcase_modelname}}Config
base_model_prefix = "{{cookiecutter.lowercase_modelname}}"
{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
@@ -289,18 +646,18 @@ XXX_START_DOCSTRING = r"""
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
:obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
Args:
config (:class:`~transformers.{{cookiecutter.camelcase_modelname}}Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.__call__` and
:func:`transformers.PreTrainedTokenizer.encode` for details.
@@ -310,7 +667,7 @@ XXX_INPUTS_DOCSTRING = r"""
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
@@ -352,41 +709,50 @@ XXX_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare {{cookiecutter.modelname}} Model transformer outputting raw hidden-states without any specific head on top.",
{{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
)
class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs):
outputs = self.{{cookiecutter.lowercase_modelname}}(inputs, **kwargs)
return outputs
@add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING)
class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMaskedLanguageModelingLoss):
@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING)
class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXxxMainLayer(config, name="transformer") if config.is_decoder:
self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") logger.warning(
"If you want to use `TF{{cookiecutter.camelcase_modelname}}ForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls")
def get_output_embeddings(self):
return self.{{cookiecutter.lowercase_modelname}}.embeddings
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
@@ -411,7 +777,8 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
"""
return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.return_dict
if isinstance(inputs, (tuple, list)):
labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 9:
@@ -419,7 +786,7 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.{{cookiecutter.lowercase_modelname}}(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
@@ -434,11 +801,10 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=training)
loss = None if labels is None else self.compute_loss(labels, prediction_scores)
if not return_dict:
output = (prediction_scores,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput(
@@ -449,32 +815,55 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
)
class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.out_proj = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
)
self.config = config
def call(self, inputs, **kwargs):
x = inputs[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
x = get_tf_activation(self.config.hidden_act)(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
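# A sketch of the head above (illustrative shapes): instead of a pooler, it reads
# the hidden state at the first position. For sequence output [32, 128, 768],
# inputs[:, 0, :] gives [32, 768], and dense -> activation -> out_proj yields
# [32, num_labels].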
@add_start_docstrings(
"""{{cookiecutter.modelname}} Model transformer with a sequence classification/regression head on top
e.g., for GLUE tasks. """,
{{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
)
class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.classifier = TF{{cookiecutter.camelcase_modelname}}ClassificationHead(config, name="classifier")
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
@@ -493,15 +882,17 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.config.return_dict
if isinstance(inputs, (tuple, list)):
labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 9:
inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.{{cookiecutter.lowercase_modelname}}(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
@@ -513,16 +904,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
return_dict=return_dict,
training=training,
)
logits = self.classifier(outputs[0])
loss = None if labels is None else self.compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
@@ -534,33 +921,36 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificat
@add_start_docstrings(
"""{{cookiecutter.modelname}} Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
{{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
)
class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.sequence_summary = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="sequence_summary"
)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@property
def dummy_inputs(self):
"""
Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
@@ -583,7 +973,6 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where :obj:`num_choices` is the size of the second dimension
of the input tensors. (See :obj:`input_ids` above)
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -611,7 +1000,8 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
assert len(inputs) <= 10, "Too many inputs."
else:
input_ids = inputs
return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.config.return_dict
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
@@ -629,8 +1019,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
if inputs_embeds is not None
else None
)
outputs = self.{{cookiecutter.lowercase_modelname}}(
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
@@ -639,21 +1028,17 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
logits = self.sequence_summary(outputs[0])
logits = self.classifier(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
@@ -663,27 +1048,27 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
attentions=outputs.attentions,
)
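# Shape sketch for the multiple-choice flow above (illustrative values): inputs of
# shape [batch=4, num_choices=3, seq_len=16] are flattened to [12, 16] before the
# encoder, per-choice logits come back as [12, 1], and tf.reshape(logits,
# (-1, num_choices)) restores [4, 3] so the loss is computed over each row of choices.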
@add_start_docstrings(
"""{{cookiecutter.modelname}} Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
{{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
)
class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
@@ -706,7 +1091,8 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.return_dict
if isinstance(inputs, (tuple, list)):
labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 9:
@@ -714,7 +1100,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.{{cookiecutter.lowercase_modelname}}(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
@@ -726,16 +1112,13 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
@@ -747,24 +1130,25 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLos
@add_start_docstrings(
"""{{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
{{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
)
class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}")
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
@add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
@@ -793,7 +1177,8 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
Positions are clamped to the length of the sequence (:obj:`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.return_dict
if isinstance(inputs, (tuple, list)):
start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[10] if len(inputs) > 10 else end_positions
@@ -803,7 +1188,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", end_positions)
outputs = self.{{cookiecutter.lowercase_modelname}}(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
@@ -815,22 +1200,20 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
return_dict=return_dict,
training=training,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
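# Sketch of the split above (assumed toy shapes): qa_outputs maps
# [batch, seq_len, hidden] to [batch, seq_len, 2]; tf.split on the last axis yields
# two [batch, seq_len, 1] tensors, and the squeeze leaves [batch, seq_len] scores
# per position for the span start and span end respectively.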
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
......
# coding=utf-8
# Copyright 2020 {{cookiecutter.authors}} and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,22 +12,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch {{cookiecutter.modelname}} model. """
import math
import os
import warnings
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config
from .file_utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
)
from .modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPooling,
MaskedLMOutput,
MultipleChoiceModelOutput,
@@ -35,30 +40,29 @@ from .modeling_outputs import (
SequenceClassifierOutput,
TokenClassifierOutput,
)
from .modeling_utils import (
PreTrainedModel,
SequenceSummary,
apply_chunking_to_forward,
find_pruneable_heads_and_indices,
prune_linear_layer,
)
from .utils import logging
from .activations import ACT2FN
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config"
_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer"
{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [
"{{cookiecutter.checkpoint_identifier}}",
# See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}}
]
def load_tf_weights_in_{{cookiecutter.lowercase_modelname}}(model, config, tf_checkpoint_path):
"""Load tf checkpoints in a pytorch model.""" """Load tf checkpoints in a pytorch model."""
try: try:
import re import re
...@@ -123,7 +127,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): ...@@ -123,7 +127,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
try:
assert (
pointer.shape == array.shape
), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
except AssertionError as e:
e.args += (pointer.shape, array.shape)
raise
@@ -132,69 +136,411 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
return model
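# Hedged usage sketch for the loader above; the config and checkpoint paths are
# placeholders for illustration, not shipped artifacts:
#
#     >>> config = {{cookiecutter.camelcase_modelname}}Config.from_json_file("path/to/config.json")
#     >>> model = {{cookiecutter.camelcase_modelname}}Model(config)
#     >>> model = load_tf_weights_in_{{cookiecutter.lowercase_modelname}}(model, config, "path/to/model.ckpt")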
def mish(x):
return x * torch.tanh(nn.functional.softplus(x))
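# mish(x) = x * tanh(softplus(x)) is a smooth, non-monotonic activation; a quick
# illustrative check (not template code):
#
#     >>> mish(torch.tensor([0.0, 1.0]))
#     tensor([0.0000, 0.8651])  # 1 * tanh(ln(1 + e)) ~= 0.8651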
# Copied from transformers.modeling_bert.BertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}Embeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = self.position_ids[:, :seq_length]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
# Copied from transformers.modeling_bert.BertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
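# Shape sketch (illustrative, mirroring the TF version): with hidden_size=4 and
# num_attention_heads=2,
#
#     >>> x = torch.ones(8, 5, 4)          # [batch, seq_len, all_head_size]
#     >>> x = x.view(8, 5, 2, 2)           # [batch, seq_len, num_heads, head_size]
#     >>> x.permute(0, 2, 1, 3).shape      # [batch, num_heads, seq_len, head_size]
#     torch.Size([8, 2, 5, 2])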
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
):
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
if encoder_hidden_states is not None:
mixed_key_layer = self.key(encoder_hidden_states)
mixed_value_layer = self.value(encoder_hidden_states)
attention_mask = encoder_attention_mask
else:
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in {{cookiecutter.camelcase_modelname}}Model forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
XxxIntermediate = nn.Module outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
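# Minimal sketch of the scaled dot-product attention computed above, with toy
# shapes assumed as (batch=1, heads=2, seq=3, head_size=4):
import math
import torch

q, k, v = (torch.randn(1, 2, 3, 4) for _ in range(3))
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size(-1))  # (1, 2, 3, 3)
probs = scores.softmax(dim=-1)        # each row sums to 1 over the key axis
context = torch.matmul(probs, v)      # (1, 2, 3, 4), back to per-head size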
# Copied from transformers.modeling_bert.BertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}SelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.modeling_bert.BertAttention with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = {{cookiecutter.camelcase_modelname}}SelfAttention(config)
self.output = {{cookiecutter.camelcase_modelname}}SelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
XxxOutput = nn.Module # Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
class XxxLayer(nn.Module): def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
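# Hedged usage sketch for the pruning logic above: it is normally reached via
# `PreTrainedModel.prune_heads`, which maps layer index -> head indices.
# `BertModel` stands in here for the generated model class.
from transformers import BertConfig, BertModel

model = BertModel(BertConfig(num_hidden_layers=2, num_attention_heads=4))
model.prune_heads({0: [0, 1]})  # remove heads 0 and 1 of layer 0
assert model.encoder.layer[0].attention.self.num_attention_heads == 2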
# Copied from transformers.modeling_bert.BertIntermediate with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}Intermediate(nn.Module):
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
self.attention = XxxAttention(config) self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.intermediate = XxxIntermediate(config) if isinstance(config.hidden_act, str):
self.output = XxxOutput(config) self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# Copied from transformers.modeling_bert.BertOutput with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}Output(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.modeling_bert.BertLayer with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}Layer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = {{cookiecutter.camelcase_modelname}}Attention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
if self.add_cross_attention:
assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
self.crossattention = {{cookiecutter.camelcase_modelname}}Attention(config)
self.intermediate = {{cookiecutter.camelcase_modelname}}Intermediate(config)
self.output = {{cookiecutter.camelcase_modelname}}Output(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
):
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
if self.is_decoder and encoder_hidden_states is not None:
assert hasattr(
self, "crossattention"
), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def forward(self, hidden_states, attention_mask=None, head_mask=None): def feed_forward_chunk(self, attention_output):
attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
attention_output = attention_outputs[0]
intermediate_output = self.intermediate(attention_output) intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output) layer_output = self.output(intermediate_output, attention_output)
outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them return layer_output
return outputs
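# Illustrative sketch of what `apply_chunking_to_forward` does in the layer
# above: the feed-forward runs on slices of the sequence dimension and the
# slices are concatenated, trading peak memory for extra kernel launches.
# The chunk size of 2 and the doubling feed-forward are assumptions.
import torch

def feed_forward(x):               # stand-in for intermediate + output
    return x * 2.0

hidden = torch.randn(1, 6, 4)      # (batch, seq, hidden)
chunked = torch.cat([feed_forward(c) for c in hidden.split(2, dim=1)], dim=1)
assert torch.allclose(chunked, feed_forward(hidden))  # same result either way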
#################################################### # Copied from transformers.modeling_bert.BertEncoder with Bert->{{cookiecutter.camelcase_modelname}}
# PreTrainedModel is a sub-class of torch.nn.Module class {{cookiecutter.camelcase_modelname}}Encoder(nn.Module):
# which take care of loading and saving pretrained weights def __init__(self, config):
# and various common utilities. super().__init__()
# self.config = config
# Here you just need to specify a few (self-explanatory) self.layer = nn.ModuleList([{{cookiecutter.camelcase_modelname}}Layer(config) for _ in range(config.num_hidden_layers)])
# pointers for your model and the weights initialization
# method if its not fully covered by PreTrainedModel's default method def forward(
#################################################### self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
output_hidden_states=False,
return_dict=False,
):
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
XxxLayerNorm = torch.nn.LayerNorm if getattr(self.config, "gradient_checkpointing", False):
XxxEmbeddings = nn.Module def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs, output_attentions)
XxxEncoder = nn.Module return custom_forward
XxxPooler = nn.Module layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
class XxxPreTrainedModel(PreTrainedModel): if not return_dict:
"""An abstract class to handle weights initialization and return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
)
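# Hedged sketch of the gradient-checkpointing branch above: the checkpointed
# module's activations are not stored during forward and are recomputed during
# backward. A single linear layer stands in for a full transformer layer.
import torch
import torch.utils.checkpoint
from torch import nn

layer = nn.Linear(4, 4)
x = torch.randn(2, 4, requires_grad=True)
y = torch.utils.checkpoint.checkpoint(layer, x)  # no activations saved here
y.sum().backward()                               # layer is re-run to get grads
assert x.grad is not None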
# Copied from transformers.modeling_bert.BertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
# Copied from transformers.modeling_bert.BertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}LMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
self.transform = {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
self.decoder.bias = self.bias
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
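# Sketch of the weight tying described above: the output projection shares the
# input embedding matrix, while the bias remains a separate parameter that can
# be resized with the vocabulary. Toy sizes (vocab=10, hidden=4) are assumed.
import torch
from torch import nn

emb = nn.Embedding(10, 4)
decoder = nn.Linear(4, 10, bias=False)
decoder.weight = emb.weight                    # same Parameter object, not a copy
bias = nn.Parameter(torch.zeros(10))

logits = decoder(torch.randn(1, 3, 4)) + bias  # (1, 3, 10)
assert decoder.weight is emb.weight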
# Copied from transformers.modeling_bert.BertOnlyMLMHead with Bert->{{cookiecutter.camelcase_modelname}}
class {{cookiecutter.camelcase_modelname}}OnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = {{cookiecutter.camelcase_modelname}}LMPredictionHead(config)
def forward(self, sequence_output):
prediction_scores = self.predictions(sequence_output)
return prediction_scores
class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = XxxConfig config_class = {{cookiecutter.camelcase_modelname}}Config
load_tf_weights = load_tf_weights_in_xxx load_tf_weights = load_tf_weights_in_{{cookiecutter.lowercase_modelname}}
base_model_prefix = "transformer" base_model_prefix = "{{cookiecutter.lowercase_modelname}}"
authorized_missing_keys = [r"position_ids"]
def _init_weights(self, module): def _init_weights(self, module):
""" Initialize the weights """ """ Initialize the weights """
...@@ -202,74 +548,64 @@ class XxxPreTrainedModel(PreTrainedModel): ...@@ -202,74 +548,64 @@ class XxxPreTrainedModel(PreTrainedModel):
# Slightly different from the TF version which uses truncated_normal for initialization # Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617 # cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, XxxLayerNorm): elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_() module.bias.data.zero_()
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None: if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_() module.bias.data.zero_()
XXX_START_DOCSTRING = r""" {{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
The XXX model was proposed in `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding
<https://arxiv.org/abs/1810.04805>`__ by....
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
usage and behavior. usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.{{cookiecutter.camelcase_modelname}}Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
XXX_INPUTS_DOCSTRING = r""" {{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r"""
Inputs: Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.XxxTokenizer`. Indices can be obtained using :class:`transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`.
See :meth:`transformers.PreTrainedTokenizer.encode` and See :func:`transformers.PreTrainedTokenizer.encode` and
:meth:`transformers.PreTrainedTokenizer.__call__` for details. :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings. Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``. Selected in the range ``[0, config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated This is useful if you want more control over how to convert `input_ids` indices into associated vectors
vectors than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`): output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail. tensors for more detail.
...@@ -282,24 +618,39 @@ XXX_INPUTS_DOCSTRING = r""" ...@@ -282,24 +618,39 @@ XXX_INPUTS_DOCSTRING = r"""
@add_start_docstrings( @add_start_docstrings(
"The bare XXX Model transformer outputting raw hidden-states without any specific head on top.", "The bare {{cookiecutter.modelname}} Model transformer outputting raw hidden-states without any specific head on top.",
XXX_START_DOCSTRING, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
) )
class XxxModel(XxxPreTrainedModel): class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelname}}PreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well
as a decoder, in which case a layer of cross-attention is added between
the self-attention layers, following the architecture described in `Attention is
all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani,
Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the
:obj:`is_decoder` argument of the configuration set to :obj:`True`.
To be used in a Seq2Seq model, the model needs to be initialized with both the :obj:`is_decoder`
argument and :obj:`add_cross_attention` set to :obj:`True`; an
:obj:`encoder_hidden_states` is then expected as an input to the forward pass.
"""
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.config = config
self.embeddings = XxxEmbeddings(config) self.embeddings = {{cookiecutter.camelcase_modelname}}Embeddings(config)
self.encoder = XxxEncoder(config) self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config)
self.pooler = XxxPooler(config)
self.init_weights() self.init_weights()
def get_input_embeddings(self): def get_input_embeddings(self):
return self.embeddings.word_embeddings return self.embeddings.word_embeddings
def set_input_embeddings(self, new_embeddings): def set_input_embeddings(self, value):
self.embeddings.word_embeddings = new_embeddings self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model. """Prunes heads of the model.
...@@ -309,11 +660,11 @@ class XxxModel(XxxPreTrainedModel): ...@@ -309,11 +660,11 @@ class XxxModel(XxxPreTrainedModel):
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-uncased", checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=BaseModelOutputWithPooling, output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
def forward( def forward(
...@@ -324,10 +675,24 @@ class XxxModel(XxxPreTrainedModel): ...@@ -324,10 +675,24 @@ class XxxModel(XxxPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None, return_dict=None,
): ):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
...@@ -350,7 +715,21 @@ class XxxModel(XxxPreTrainedModel): ...@@ -350,7 +715,21 @@ class XxxModel(XxxPreTrainedModel):
if token_type_ids is None: if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
# If a 2D or 3D attention mask is provided for the cross-attention
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
else:
encoder_extended_attention_mask = None
# Prepare head mask if needed # Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head # 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N # attention_probs has shape bsz x n_heads x N x N
...@@ -358,43 +737,54 @@ class XxxModel(XxxPreTrainedModel): ...@@ -358,43 +737,54 @@ class XxxModel(XxxPreTrainedModel):
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
##################################
# Replace this with your model code
embedding_output = self.embeddings( embedding_output = self.embeddings(
input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
) )
encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) encoder_outputs = self.encoder(
embedding_output,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
if not return_dict: if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:] return (sequence_output,) + encoder_outputs[1:]
return BaseModelOutputWithPooling( return BaseModelOutput(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states, hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions, attentions=encoder_outputs.attentions,
) )
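# Hedged end-to-end usage sketch for the bare model; `bert-base-uncased` is a
# stand-in checkpoint (downloads weights on first use):
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
inputs = tokenizer("Hello world", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # torch.Size([1, seq_len, hidden_size])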
@add_start_docstrings("""XXX Model with a `language modeling` head on top. """, XXX_START_DOCSTRING) @add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING)
class XxxForMaskedLM(XxxPreTrainedModel): class {{cookiecutter.camelcase_modelname}}ForMaskedLM({{cookiecutter.camelcase_modelname}}PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = XxxModel(config) if config.is_decoder:
self.lm_head = nn.Linear(config.n_embd, config.vocab_size) logger.warning(
"If you want to use `{{cookiecutter.camelcase_modelname}}ForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
self.cls = {{cookiecutter.camelcase_modelname}}OnlyMLMHead(config)
self.init_weights() self.init_weights()
def get_output_embeddings(self): def get_output_embeddings(self):
return self.lm_head return self.cls.predictions.decoder
@add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-uncased", checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=MaskedLMOutput, output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
...@@ -406,6 +796,8 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -406,6 +796,8 @@ class XxxForMaskedLM(XxxPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
...@@ -416,24 +808,26 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -416,24 +808,26 @@ class XxxForMaskedLM(XxxPreTrainedModel):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.{{cookiecutter.lowercase_modelname}}(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
position_ids=position_ids, position_ids=position_ids,
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output) prediction_scores = self.cls(sequence_output)
masked_lm_loss = None masked_lm_loss = None
if labels is not None: if labels is not None:
...@@ -441,7 +835,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -441,7 +835,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict: if not return_dict:
output = (prediction_scores,) + outputs[2:] output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return MaskedLMOutput( return MaskedLMOutput(
...@@ -451,27 +845,60 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -451,27 +845,60 @@ class XxxForMaskedLM(XxxPreTrainedModel):
attentions=outputs.attentions, attentions=outputs.attentions,
) )
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
# add a dummy token
assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
return {"input_ids": input_ids, "attention_mask": attention_mask}
class {{cookiecutter.camelcase_modelname}}ClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
self.config = config
def forward(self, features, **kwargs):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
x = ACT2FN[self.config.hidden_act](x)
x = self.dropout(x)
x = self.out_proj(x)
return x
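# Minimal sketch of the head above: pooling is done by taking the first
# token's hidden state instead of a dedicated pooler module. Toy sizes
# (hidden=4, num_labels=3) are assumptions.
import torch
from torch import nn

features = torch.randn(2, 7, 4)   # (batch, seq, hidden)
head = nn.Linear(4, 3)
logits = head(features[:, 0, :])  # first token ~ [CLS]/<s>
assert logits.shape == (2, 3)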
@add_start_docstrings( @add_start_docstrings(
"""XXX Model transformer with a sequence classification/regression head on top (a linear layer on top of """{{cookiecutter.modelname}} Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """, the pooled output) e.g. for GLUE tasks. """,
XXX_START_DOCSTRING, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
) )
class XxxForSequenceClassification(XxxPreTrainedModel): class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
self.transformer = XxxModel(config) self.classifier = {{cookiecutter.camelcase_modelname}}ClassificationHead(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.init_weights() self.init_weights()
@add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-uncased", checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=SequenceClassifierOutput, output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
...@@ -497,7 +924,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -497,7 +924,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.{{cookiecutter.lowercase_modelname}}(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
...@@ -509,10 +936,8 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -509,10 +936,8 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
return_dict=return_dict, return_dict=return_dict,
) )
pooled_output = outputs[1] sequence_output = outputs[0]
logits = self.classifier(sequence_output)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None loss = None
if labels is not None: if labels is not None:
...@@ -525,7 +950,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -525,7 +950,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict: if not return_dict:
output = (logits,) + outputs[2:] output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput( return SequenceClassifierOutput(
...@@ -535,26 +960,25 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -535,26 +960,25 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
attentions=outputs.attentions, attentions=outputs.attentions,
) )
@add_start_docstrings( @add_start_docstrings(
"""XXX Model with a multiple choice classification head on top (a linear layer on top of """{{cookiecutter.modelname}} Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
XXX_START_DOCSTRING, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
) )
class XxxForMultipleChoice(XxxPreTrainedModel): class {{cookiecutter.camelcase_modelname}}ForMultipleChoice({{cookiecutter.camelcase_modelname}}PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.transformer = XxxModel(config) self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.sequence_summary = SequenceSummary(config)
self.classifier = nn.Linear(config.hidden_size, 1) self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights() self.init_weights()
@add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-uncased", checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=MultipleChoiceModelOutput, output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
...@@ -590,7 +1014,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -590,7 +1014,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
else None else None
) )
outputs = self.transformer( outputs = self.{{cookiecutter.lowercase_modelname}}(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
...@@ -602,9 +1026,9 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -602,9 +1026,9 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
return_dict=return_dict, return_dict=return_dict,
) )
pooled_output = outputs[1] sequence_output = outputs[0]
pooled_output = self.dropout(pooled_output) pooled_output = self.sequence_summary(sequence_output)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices) reshaped_logits = logits.view(-1, num_choices)
...@@ -614,7 +1038,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -614,7 +1038,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
loss = loss_fct(reshaped_logits, labels) loss = loss_fct(reshaped_logits, labels)
if not return_dict: if not return_dict:
output = (reshaped_logits,) + outputs[2:] output = (reshaped_logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return MultipleChoiceModelOutput( return MultipleChoiceModelOutput(
...@@ -626,25 +1050,25 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -626,25 +1050,25 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XXX Model with a token classification head on top (a linear layer on top of """{{cookiecutter.modelname}} Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XXX_START_DOCSTRING, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
) )
class XxxForTokenClassification(XxxPreTrainedModel): class {{cookiecutter.camelcase_modelname}}ForTokenClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.transformer = XxxModel(config) self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob) self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights() self.init_weights()
@add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-uncased", checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=TokenClassifierOutput, output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
...@@ -668,7 +1092,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): ...@@ -668,7 +1092,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.{{cookiecutter.lowercase_modelname}}(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
...@@ -700,7 +1124,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): ...@@ -700,7 +1124,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict: if not return_dict:
output = (logits,) + outputs[2:] output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput( return TokenClassifierOutput(
...@@ -712,24 +1136,26 @@ class XxxForTokenClassification(XxxPreTrainedModel): ...@@ -712,24 +1136,26 @@ class XxxForTokenClassification(XxxPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XXX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear """{{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
XXX_START_DOCSTRING, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING,
) )
class XxxForQuestionAnswering(XxxPreTrainedModel): class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.camelcase_modelname}}PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels self.num_labels = config.num_labels
self.transformer = XxxModel(config) self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights() self.init_weights()
@add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings( @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC, tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xxx-base-uncased", checkpoint="{{cookiecutter.checkpoint_identifier}}",
output_type=QuestionAnsweringModelOutput, output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC, config_class=_CONFIG_FOR_DOC,
) )
...@@ -759,7 +1185,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): ...@@ -759,7 +1185,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.{{cookiecutter.lowercase_modelname}}(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
...@@ -796,7 +1222,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): ...@@ -796,7 +1222,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
if not return_dict: if not return_dict:
output = (start_logits, end_logits) + outputs[2:] output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput( return QuestionAnsweringModelOutput(
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import {{cookiecutter.camelcase_modelname}}Config, is_tf_available
from transformers.testing_utils import require_tf, slow
from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_{{cookiecutter.lowercase_modelname}} import (
TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
TF{{cookiecutter.camelcase_modelname}}Model,
)
class TF{{cookiecutter.camelcase_modelname}}ModelTester:
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = {{cookiecutter.camelcase_modelname}}Config(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
return_dict=True,
)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
result = model(inputs)
inputs = [input_ids, input_mask]
result = model(inputs)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
result = model(inputs)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
result = model(inputs)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
inputs = {
"input_ids": multiple_choice_inputs_ids,
"attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids,
}
result = model(inputs)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def create_and_check_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
result = model(inputs)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
result = model(inputs)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_tf
class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase):
all_model_classes = (
(
TF{{cookiecutter.camelcase_modelname}}Model,
TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
)
if is_tf_available()
else ()
)
def setUp(self):
self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self)
self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained("{{cookiecutter.checkpoint_identifier}}")
self.assertIsNotNone(model)
# coding=utf-8 # coding=utf-8
# Copyright 2018 XXX Authors. # Copyright 2018 The Google AI Language Team Authors.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -12,37 +12,32 @@ ...@@ -12,37 +12,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Testing suite for the PyTorch {{cookiecutter.modelname}} model. """
import unittest import unittest
from transformers import is_torch_available from transformers import is_torch_available
from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, ids_tensor
from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
if is_torch_available(): if is_torch_available():
from transformers import ( from transformers import (
AutoModelForMaskedLM, {{cookiecutter.camelcase_modelname}}Config,
AutoTokenizer, {{cookiecutter.camelcase_modelname}}ForMaskedLM,
XxxConfig, {{cookiecutter.camelcase_modelname}}ForMultipleChoice,
XxxForMaskedLM, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
XxxForMultipleChoice, {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
XxxForQuestionAnswering, {{cookiecutter.camelcase_modelname}}ForTokenClassification,
XxxForSequenceClassification, {{cookiecutter.camelcase_modelname}}Model,
XxxForTokenClassification,
XxxModel,
) )
from transformers.file_utils import cached_property from transformers.modeling_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST
#
class XxxModelTester:
"""You can also import this e.g from .test_modeling_bart import BartModelTester """
class {{cookiecutter.camelcase_modelname}}ModelTester:
def __init__( def __init__(
self, self,
parent, parent,
...@@ -96,7 +91,7 @@ class XxxModelTester: ...@@ -96,7 +91,7 @@ class XxxModelTester:
input_mask = None input_mask = None
if self.use_input_mask: if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None token_type_ids = None
if self.use_token_type_ids: if self.use_token_type_ids:
...@@ -110,7 +105,7 @@ class XxxModelTester: ...@@ -110,7 +105,7 @@ class XxxModelTester:
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices) choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = XxxConfig( config = {{cookiecutter.camelcase_modelname}}Config(
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers, num_hidden_layers=self.num_hidden_layers,
...@@ -121,6 +116,7 @@ class XxxModelTester: ...@@ -121,6 +116,7 @@ class XxxModelTester:
attention_probs_dropout_prob=self.attention_probs_dropout_prob, attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True, return_dict=True,
) )
...@@ -130,19 +126,18 @@ class XxxModelTester: ...@@ -130,19 +126,18 @@ class XxxModelTester:
def create_and_check_model( def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = XxxModel(config=config) model = {{cookiecutter.camelcase_modelname}}Model(config=config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids) result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids) result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
def create_and_check_for_masked_lm( def create_and_check_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = XxxForMaskedLM(config=config) model = {{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
...@@ -151,7 +146,7 @@ class XxxModelTester: ...@@ -151,7 +146,7 @@ class XxxModelTester:
def create_and_check_for_question_answering( def create_and_check_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = XxxForQuestionAnswering(config=config) model = {{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
result = model( result = model(
...@@ -168,7 +163,7 @@ class XxxModelTester: ...@@ -168,7 +163,7 @@ class XxxModelTester:
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = XxxForSequenceClassification(config) model = {{cookiecutter.camelcase_modelname}}ForSequenceClassification(config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
...@@ -178,7 +173,7 @@ class XxxModelTester: ...@@ -178,7 +173,7 @@ class XxxModelTester:
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = XxxForTokenClassification(config=config) model = {{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
...@@ -188,7 +183,7 @@ class XxxModelTester: ...@@ -188,7 +183,7 @@ class XxxModelTester:
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
config.num_choices = self.num_choices config.num_choices = self.num_choices
model = XxxForMultipleChoice(config=config) model = {{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config)
model.to(torch_device) model.to(torch_device)
model.eval() model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
...@@ -218,17 +213,24 @@ class XxxModelTester: ...@@ -218,17 +213,24 @@ class XxxModelTester:
@require_torch @require_torch
class XxxModelTest(ModelTesterMixin, unittest.TestCase): class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = ( all_model_classes = (
(XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification) (
{{cookiecutter.camelcase_modelname}}Model,
{{cookiecutter.camelcase_modelname}}ForMaskedLM,
{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
{{cookiecutter.camelcase_modelname}}ForTokenClassification,
)
if is_torch_available() if is_torch_available()
else () else ()
) )
def setUp(self): def setUp(self):
self.model_tester = XxxModelTester(self) self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self)
self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37) self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37)
def test_config(self): def test_config(self):
self.config_tester.run_common_tests() self.config_tester.run_common_tests()
...@@ -241,6 +243,10 @@ class XxxModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -241,6 +243,10 @@ class XxxModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_for_question_answering(self): def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs) self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
...@@ -253,55 +259,10 @@ class XxxModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -253,55 +259,10 @@ class XxxModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_token_classification(*config_and_inputs) self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs)
@slow @slow
def test_lm_outputs_same_as_reference_model(self): def test_model_from_pretrained(self):
"""Write something that could help someone fixing this here.""" for model_name in {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
checkpoint_path = "XXX/bart-large" model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained(model_name)
model = self.big_model self.assertIsNotNone(model)
tokenizer = AutoTokenizer.from_pretrained(
checkpoint_path
) # same with AutoTokenizer (see tokenization_auto.py). This is not mandatory
# MODIFY THIS DEPENDING ON YOUR MODEL'S RELEVANT TASK.
batch = tokenizer(["I went to the <mask> yesterday"]).to(torch_device)
desired_mask_result = tokenizer.decode("store") # update this
logits = model(**batch).logits
masked_index = (batch.input_ids == self.tokenizer.mask_token_id).nonzero()
assert model.num_parameters() == 175e9 # a joke
mask_entry_logits = logits[0, masked_index.item(), :]
probs = mask_entry_logits.softmax(dim=0)
_, predictions = probs.topk(1)
self.assertEqual(tokenizer.decode(predictions), desired_mask_result)
@cached_property
def big_model(self):
"""Cached property means this code will only be executed once."""
checkpoint_path = "XXX/bart-large"
model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(
torch_device
) # test whether AutoModel can determine your model_class from checkpoint name
if torch_device == "cuda":
model.half()
# optional: do more testing! This will save you time later!
@slow
def test_that_XXX_can_be_used_in_a_pipeline(self):
"""We can use self.big_model here without calling __init__ again."""
pass
def test_XXX_loss_doesnt_change_if_you_add_padding(self):
pass
def test_XXX_bad_args(self):
pass
def test_XXX_backward_pass_reduces_loss(self):
"""Test loss/gradients same as reference implementation, for example."""
pass
@require_torch_gpu
def test_large_inputs_in_fp16_dont_cause_overflow(self):
pass
## This file is made so that specific statements may be copied inside existing files. This is useful to copy
## import statements in __init__.py, or to complete model lists in the AUTO files.
##
## It is to be used as such:
## Put '# To replace in: "FILE_PATH"' to indicate that the contents will be copied into the file at path FILE_PATH
## Put '# Below: "STATEMENT"' to copy the contents below **the first occurrence** of that line in the file at FILE_PATH
## Put '# Replace with:' followed by the lines of content that should be copied
## End a statement with '# End.'. If a new statement starts without redefining the FILE_PATH, it will continue pasting
## content in that file.
##
## Put '## COMMENT' to comment on the file.
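For illustration only, a minimal statement (the file path, anchor line, and import below are hypothetical, not part of the template) would read:

# To replace in: "src/transformers/some_module.py"
# Below: "# Add imports here"
# Replace with:
from .modeling_example import ExampleModel
# End.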
# To replace in: "src/transformers/__init__.py"
# Below: "if is_torch_available():" if generating PyTorch
# Replace with:
from .modeling_{{cookiecutter.lowercase_modelname}} import (
{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
{{cookiecutter.camelcase_modelname}}ForMaskedLM,
{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
{{cookiecutter.camelcase_modelname}}ForTokenClassification,
{{cookiecutter.camelcase_modelname}}Layer,
{{cookiecutter.camelcase_modelname}}Model,
{{cookiecutter.camelcase_modelname}}PreTrainedModel,
load_tf_weights_in_{{cookiecutter.lowercase_modelname}},
)
# End.
# Below: "if is_tf_available():" if generating TensorFlow
# Replace with:
from .modeling_tf_{{cookiecutter.lowercase_modelname}} import (
TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
TF{{cookiecutter.camelcase_modelname}}Layer,
TF{{cookiecutter.camelcase_modelname}}Model,
TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
)
# End.
# Below: "from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig"
# Replace with:
from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP, {{cookiecutter.camelcase_modelname}}Config
# End.
# To replace in: "src/transformers/configuration_auto.py"
# Below: "# Add configs here"
# Replace with:
("{{cookiecutter.lowercase_modelname}}", {{cookiecutter.camelcase_modelname}}Config),
# End.
# Below: "# Add archive maps here"
# Replace with:
{{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP,
# End.
# Below: "from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig",
# Replace with:
from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP, {{cookiecutter.camelcase_modelname}}Config
# End.
# Below: "# Add full (and cased) model names here"
# Replace with:
("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}"),
# End.
# To replace in: "src/transformers/modeling_auto.py" if generating PyTorch
# Below: "from .configuration_auto import ("
# Replace with:
{{cookiecutter.camelcase_modelname}}Config,
# End.
# Below: "# Add modeling imports here"
# Replace with:
from .modeling_{{cookiecutter.lowercase_modelname}} import (
{{cookiecutter.camelcase_modelname}}ForMaskedLM,
{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
{{cookiecutter.camelcase_modelname}}ForTokenClassification,
{{cookiecutter.camelcase_modelname}}Model,
)
# End.
# Below: "# Base model mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}Model),
# End.
# Below: "# Model with LM heads mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM),
# End.
# Below: "# Model for Masked LM mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM),
# End.
# Below: "# Model for Sequence Classification mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForSequenceClassification),
# End.
# Below: "# Model for Question Answering mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering),
# End.
# Below: "# Model for Token Classification mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForTokenClassification),
# End.
# Below: "# Model for Multiple Choice mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMultipleChoice),
# End.
# To replace in: "src/transformers/modeling_tf_auto.py" if generating TensorFlow
# Below: "from .configuration_auto import ("
# Replace with:
{{cookiecutter.camelcase_modelname}}Config,
# End.
# Below: "# Add modeling imports here"
# Replace with:
from .modeling_tf_{{cookiecutter.lowercase_modelname}} import (
TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
TF{{cookiecutter.camelcase_modelname}}Model,
)
# End.
# Below: "# Base model mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}Model),
# End.
# Below: "# Model with LM heads mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM),
# End.
# Below: "# Model for Masked LM mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM),
# End.
# Below: "# Model for Sequence Classification mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification),
# End.
# Below: "# Model for Question Answering mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering),
# End.
# Below: "# Model for Token Classification mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForTokenClassification),
# End.
# Below: "# Model for Multiple Choice mapping"
# Replace with:
({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice),
# End.
# coding=utf-8 # coding=utf-8
# Copyright 2018 XXX Authors. # Copyright 2018 The HuggingFace Inc. team.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -12,181 +12,146 @@ ...@@ -12,181 +12,146 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Tokenization class for model XXX.""" """Tokenization classes for {{cookiecutter.modelname}}."""
{%- if cookiecutter.tokenizer_type == "Based on BERT" %}
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .utils import logging
import collections
import logging
import os
from typing import List, Optional, Tuple
from .tokenization_utils import PreTrainedTokenizer logger = logging.get_logger(__name__)
logger = logging.getLogger(__name__)
####################################################
# In this template, replace all the XXX (various casings) with your model name
####################################################
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP = { PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": { "vocab_file": {
"xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt", "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
"xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
} }
} }
####################################################
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"xxx-base-uncased": 512, "{{cookiecutter.checkpoint_identifier}}": 512,
"xxx-large-uncased": 512,
} }
####################################################
# Mapping from model shortcut names to a dictionary of additional
# keyword arguments for Tokenizer `__init__`.
# To be used for checkpoint specific configurations.
####################################################
PRETRAINED_INIT_CONFIGURATION = { PRETRAINED_INIT_CONFIGURATION = {
"xxx-base-uncased": {"do_lower_case": True}, "{{cookiecutter.checkpoint_identifier}}": {"do_lower_case": False},
"xxx-large-uncased": {"do_lower_case": True},
} }
def load_vocab(vocab_file): class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer):
"""Loads a vocabulary file into a dictionary.""" r"""
vocab = collections.OrderedDict() Construct a {{cookiecutter.modelname}} tokenizer.
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines() :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
for index, token in enumerate(tokens): tokenization: punctuation splitting and wordpiece.
token = token.rstrip("\n")
vocab[token] = index Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
return vocab parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
class XxxTokenizer(PreTrainedTokenizer): class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
r""" r"""
Constructs a XXX tokenizer. Based on XXX. Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
{%- elif cookiecutter.tokenizer_type == "Standalone" %}
import warnings
from tokenizers import ByteLevelBPETokenizer
from .tokenization_utils import AddedToken, PreTrainedTokenizer
from .tokenization_utils_base import BatchEncoding
from .tokenization_utils_fast import PreTrainedTokenizerFast
from typing import List, Optional
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {}
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. PRETRAINED_VOCAB_FILES_MAP = {}
Users should refer to this superclass for more information regarding those methods.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"{{cookiecutter.checkpoint_identifier}}": 1024,
}
class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
"""
Construct a {{cookiecutter.modelname}} tokenizer. Based on byte-level Byte-Pair-Encoding.
Args: Args:
vocab_file (:obj:`str`): vocab_file (:obj:`str`):
File containing the vocabulary. Path to the vocabulary file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to do basic tokenization before WordPiece.
never_split (:obj:`Iterable`, `optional`):
Collection of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__( def __init__(
self, self,
vocab_file, vocab_file,
do_lower_case=True, unk_token="<|endoftext|>",
do_basic_tokenize=True, bos_token="<|endoftext|>",
never_split=None, eos_token="<|endoftext|>",
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
**kwargs **kwargs
): ):
super().__init__( bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
unk_token=unk_token, eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token=sep_token, unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token=pad_token, super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
if not os.path.isfile(vocab_file): "Initialisation"
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
# Replace and adapt
# if do_basic_tokenize:
# self.basic_tokenizer = BasicTokenizer(
# do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
# )
# self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property @property
def vocab_size(self): def vocab_size(self):
return len(self.vocab) "Returns vocab size"
def get_vocab(self): def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder) "Returns vocab as a dict"
def _tokenize(self, text): def _tokenize(self, text):
split_tokens = [] """ Returns a tokenized string. """
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
# If the token is part of the never_split set
if token in self.basic_tokenizer.never_split:
split_tokens.append(token)
else:
split_tokens += self.wordpiece_tokenizer.tokenize(token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """ """ Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index): def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab.""" """Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """ """ Converts a sequence of tokens (string) in a single string. """
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string def save_vocabulary(self, save_directory):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def build_inputs_with_special_tokens( def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
...@@ -194,10 +159,10 @@ class XxxTokenizer(PreTrainedTokenizer): ...@@ -194,10 +159,10 @@ class XxxTokenizer(PreTrainedTokenizer):
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A XXX sequence has the following format: A {{cookiecutter.modelname}} sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``<s> X </s>``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``<s> A </s></s> B </s>``
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
...@@ -212,7 +177,7 @@ class XxxTokenizer(PreTrainedTokenizer): ...@@ -212,7 +177,7 @@ class XxxTokenizer(PreTrainedTokenizer):
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
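As a sanity check on the pair layout above, here is a standalone sketch with illustrative special-token ids (cls_id=0 and sep_id=2 are assumptions; the real values depend on the trained vocabulary):

def _pair_layout(ids_0, ids_1=None, cls_id=0, sep_id=2):
    # Mirrors the template's logic: <s> X </s> for one sequence, <s> A </s></s> B </s> for a pair.
    if ids_1 is None:
        return [cls_id] + ids_0 + [sep_id]
    return [cls_id] + ids_0 + [sep_id, sep_id] + ids_1 + [sep_id]

assert _pair_layout([5, 6], [7, 8]) == [0, 5, 6, 2, 2, 7, 8, 2]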
def get_special_tokens_mask( def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
...@@ -232,7 +197,6 @@ class XxxTokenizer(PreTrainedTokenizer): ...@@ -232,7 +197,6 @@ class XxxTokenizer(PreTrainedTokenizer):
Returns: Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
raise ValueError( raise ValueError(
...@@ -241,23 +205,122 @@ class XxxTokenizer(PreTrainedTokenizer): ...@@ -241,23 +205,122 @@ class XxxTokenizer(PreTrainedTokenizer):
) )
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None: if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
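The mask marks exactly the tokens added by the method above, including the doubled separator for pairs; a minimal, self-contained illustration of those branches:

def _special_mask(ids_0, ids_1=None):
    # 1 for special tokens, 0 for sequence tokens, matching the return values above.
    if ids_1 is None:
        return [1] + [0] * len(ids_0) + [1]
    return [1] + [0] * len(ids_0) + [1, 1] + [0] * len(ids_1) + [1]

assert _special_mask([5, 6], [7, 8]) == [1, 0, 0, 1, 1, 0, 0, 1]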
def create_token_type_ids_from_sequences( def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format: {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
:: Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
return (text, kwargs)
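The prefix-space rule above only touches text that does not already begin with whitespace; a small sketch of just that condition:

def _maybe_prefix(text, is_split_into_words, add_prefix_space):
    # A leading space is added only when requested and not already present.
    if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
        return " " + text
    return text

assert _maybe_prefix("hello", False, True) == " hello"
assert _maybe_prefix(" hello", False, True) == " hello"
assert _maybe_prefix("hello", False, False) == "hello"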
class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
trim_offsets=True,
**kwargs
):
super().__init__(
ByteLevelBPETokenizer(
vocab_file=vocab_file,
merges_file=merges_file,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
),
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
**kwargs,
)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = None
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False) if is_split_into_words is None else is_split_into_words
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = None
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False) if is_split_into_words is None else is_split_into_words
return super()._encode_plus(*args, **kwargs)
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
return output
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
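Because bos and eos both default to `<|endoftext|>` here, a pair ends up delimited by a doubled end-of-text token; a quick sketch using 50256 as an assumed id for that token:

def _bpe_layout(ids_0, ids_1=None, bos_id=50256, eos_id=50256):
    # Mirrors the fast tokenizer's logic above; 50256 is only an illustrative id.
    out = [bos_id] + ids_0 + [eos_id]
    if ids_1 is None:
        return out
    return out + [eos_id] + ids_1 + [eos_id]

assert _bpe_layout([5, 6], [7, 8]) == [50256, 5, 6, 50256, 50256, 7, 8, 50256]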
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
{{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
...@@ -266,31 +329,14 @@ class XxxTokenizer(PreTrainedTokenizer): ...@@ -266,31 +329,14 @@ class XxxTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given :obj:`List[int]`: List of zeros.
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
if token_ids_1 is None: if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
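The diff above also changes the token type ids: the old BERT-style helper produced 0s for the first segment and 1s for the second, while the new helper returns zeros everywhere. A compact check of both behaviours (one cls and one sep token assumed; the actual ids are irrelevant to the lengths):

def _old_style_type_ids(ids_0, ids_1):
    return [0] * (1 + len(ids_0) + 1) + [1] * (len(ids_1) + 1)

def _new_style_type_ids(ids_0, ids_1):
    return [0] * (1 + len(ids_0) + 1 + 1 + len(ids_1) + 1)

assert _old_style_type_ids([5, 6], [7, 8]) == [0, 0, 0, 0, 1, 1, 1]
assert _new_style_type_ids([5, 6], [7, 8]) == [0] * 8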
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
index = 0 {% endif %}
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
"Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file)
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
{{cookiecutter.uppercase_modelname}}
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The {{cookiecutter.modelname}} model was proposed in `<INSERT PAPER NAME HERE>
<<INSERT PAPER LINK HERE>>`__ by <INSERT AUTHORS HERE>. <INSERT SHORT SUMMARY HERE>
The abstract from the paper is the following:
*<INSERT PAPER ABSTRACT HERE>*
Tips:
<INSERT TIPS ABOUT MODEL HERE>
{{cookiecutter.camelcase_modelname}}Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Config
:members:
{{cookiecutter.camelcase_modelname}}Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Tokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
{{cookiecutter.camelcase_modelname}}TokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
{% if "PyTorch" in cookiecutter.generate_tensorflow_and_pytorch -%}
{{cookiecutter.camelcase_modelname}}Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Model
:members: forward
{{cookiecutter.camelcase_modelname}}ForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMaskedLM
:members: forward
{{cookiecutter.camelcase_modelname}}ForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
:members: forward
{{cookiecutter.camelcase_modelname}}ForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
:members:
{{cookiecutter.camelcase_modelname}}ForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForTokenClassification
:members: forward
{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
:members: forward
{% endif -%}
{% if "TensorFlow" in cookiecutter.generate_tensorflow_and_pytorch -%}
TF{{cookiecutter.camelcase_modelname}}Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}Model
:members: call
TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
:members: call
TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
:members: call
TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
:members: call
TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
:members: call
TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
:members: call
{% endif -%}
{
"modelname": "BrandNewBERT",
"uppercase_modelname": "BRAND_NEW_BERT",
"lowercase_modelname": "brand_new_bert",
"camelcase_modelname": "BrandNewBert",
"authors": "The HuggingFace Team",
"checkpoint_identifier": "brand-new-bert-base-cased",
"tokenizer_type": ["Based on BERT", "Standalone"],
"generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"]
}
\ No newline at end of file
{
"modelname": "EncoderBERT",
"uppercase_modelname": "ENCODER_BERT",
"lowercase_modelname": "encoder_bert",
"camelcase_modelname": "EncoderBert",
"authors": "The HuggingFace Team",
"checkpoint_identifier": "brand-new-bert-base-cased",
"tokenizer_type": "Based on BERT",
"generate_tensorflow_and_pytorch": "PyTorch & TensorFlow"
}