Unverified Commit b4b613b1 authored by Andreas Madsen's avatar Andreas Madsen Committed by GitHub
Browse files

Implement Roberta PreLayerNorm (#20305)



* Copy RoBERTa

* formatting

* implement RoBERTa with prelayer normalization

* update test expectations

* add documentation

* add convertion script for DinkyTrain weights

* update checkpoint repo

Unfortunately the original checkpoints assumes a hacked roberta model

* add to RoBERTa-PreLayerNorm docs to toc

* run utils/check_copies.py

* lint files

* remove unused import

* fix check_repo reporting wrongly a test is missing

* fix import error, caused by rebase

* run make fix-copies

* add RobertaPreLayerNormConfig to ROBERTA_EMBEDDING_ADJUSMENT_CONFIGS

* Fix documentation <Facebook> -> Facebook
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* fixup: Fix documentation <Facebook> -> Facebook
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Add missing Flax header
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* expected_slice -> EXPECTED_SLICE
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* update copies after rebase

* add missing copied from statements

* make fix-copies

* make prelayernorm explicit in code

* fix checkpoint path for the original implementation

* add flax integration tests

* improve docs

* update utils/documentation_tests.txt

* lint files

* Remove Copyright notice
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>

* make fix-copies

* Remove EXPECTED_SLICE calculation comments
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 7032e020
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert RoBERTa-PreLayerNorm checkpoint."""
import argparse
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, RobertaPreLayerNormConfig, RobertaPreLayerNormForMaskedLM
from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
def convert_roberta_prelayernorm_checkpoint_to_pytorch(checkpoint_repo: str, pytorch_dump_folder_path: str):
"""
Copy/paste/tweak roberta_prelayernorm's weights to our BERT structure.
"""
# convert configuration
config = RobertaPreLayerNormConfig.from_pretrained(
checkpoint_repo, architectures=["RobertaPreLayerNormForMaskedLM"]
)
# convert state_dict
original_state_dict = torch.load(hf_hub_download(repo_id=checkpoint_repo, filename="pytorch_model.bin"))
state_dict = {}
for tensor_key, tensor_value in original_state_dict.items():
# The transformer implementation gives the model a unique name, rather than overwiriting 'roberta'
if tensor_key.startswith("roberta."):
tensor_key = "roberta_prelayernorm." + tensor_key[len("roberta.") :]
# The original implementation contains weights which are not used, remove them from the state_dict
if tensor_key.endswith(".self.LayerNorm.weight") or tensor_key.endswith(".self.LayerNorm.bias"):
continue
state_dict[tensor_key] = tensor_value
model = RobertaPreLayerNormForMaskedLM.from_pretrained(
pretrained_model_name_or_path=None, config=config, state_dict=state_dict
)
model.save_pretrained(pytorch_dump_folder_path)
# convert tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint_repo)
tokenizer.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--checkpoint-repo",
default=None,
type=str,
required=True,
help="Path the official PyTorch dump, e.g. 'andreasmadsen/efficient_mlm_m0.40'.",
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_roberta_prelayernorm_checkpoint_to_pytorch(args.checkpoint_repo, args.pytorch_dump_folder_path)
......@@ -928,6 +928,62 @@ class FlaxRobertaPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormForCausalLM(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormForMaskedLM(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormForMultipleChoice(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormForQuestionAnswering(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormForSequenceClassification(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormForTokenClassification(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormModel(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaPreLayerNormPreTrainedModel(metaclass=DummyObject):
_backends = ["flax"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRoFormerForMaskedLM(metaclass=DummyObject):
_backends = ["flax"]
......
......@@ -4854,6 +4854,65 @@ class RobertaPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST = None
class RobertaPreLayerNormForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class RobertaPreLayerNormPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
ROC_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
......@@ -2124,6 +2124,72 @@ class TFRobertaPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["tf"])
TF_ROBERTA_PRELAYERNORM_PRETRAINED_MODEL_ARCHIVE_LIST = None
class TFRobertaPreLayerNormForCausalLM(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormForMaskedLM(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormForMultipleChoice(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormForQuestionAnswering(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormForSequenceClassification(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormForTokenClassification(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormMainLayer(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
class TFRobertaPreLayerNormPreTrainedModel(metaclass=DummyObject):
_backends = ["tf"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
TF_ROFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from transformers import RobertaPreLayerNormConfig, is_flax_available
from transformers.testing_utils import require_flax, slow
from ...test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
if is_flax_available():
import jax.numpy as jnp
from transformers.models.roberta_prelayernorm.modeling_flax_roberta_prelayernorm import (
FlaxRobertaPreLayerNormForCausalLM,
FlaxRobertaPreLayerNormForMaskedLM,
FlaxRobertaPreLayerNormForMultipleChoice,
FlaxRobertaPreLayerNormForQuestionAnswering,
FlaxRobertaPreLayerNormForSequenceClassification,
FlaxRobertaPreLayerNormForTokenClassification,
FlaxRobertaPreLayerNormModel,
)
# Copied from tests.models.roberta.test_modelling_flax_roberta.FlaxRobertaModelTester with Roberta->RobertaPreLayerNorm
class FlaxRobertaPreLayerNormModelTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_attention_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_choices=4,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_choices = num_choices
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
config = RobertaPreLayerNormConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
)
return config, input_ids, token_type_ids, attention_mask
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, token_type_ids, attention_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
return config, inputs_dict
def prepare_config_and_inputs_for_decoder(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, token_type_ids, attention_mask = config_and_inputs
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return (
config,
input_ids,
token_type_ids,
encoder_hidden_states,
encoder_attention_mask,
)
@require_flax
# Copied from tests.models.roberta.test_modelling_flax_roberta.FlaxRobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta-base->andreasmadsen/efficient_mlm_m0.40
class FlaxRobertaPreLayerNormModelTest(FlaxModelTesterMixin, unittest.TestCase):
test_head_masking = True
all_model_classes = (
(
FlaxRobertaPreLayerNormModel,
FlaxRobertaPreLayerNormForCausalLM,
FlaxRobertaPreLayerNormForMaskedLM,
FlaxRobertaPreLayerNormForSequenceClassification,
FlaxRobertaPreLayerNormForTokenClassification,
FlaxRobertaPreLayerNormForMultipleChoice,
FlaxRobertaPreLayerNormForQuestionAnswering,
)
if is_flax_available()
else ()
)
def setUp(self):
self.model_tester = FlaxRobertaPreLayerNormModelTester(self)
@slow
def test_model_from_pretrained(self):
for model_class_name in self.all_model_classes:
model = model_class_name.from_pretrained("andreasmadsen/efficient_mlm_m0.40", from_pt=True)
outputs = model(np.ones((1, 1)))
self.assertIsNotNone(outputs)
@require_flax
class TFRobertaPreLayerNormModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_masked_lm(self):
model = FlaxRobertaPreLayerNormForMaskedLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40", from_pt=True)
input_ids = np.array([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]], dtype=jnp.int32)
output = model(input_ids)[0]
expected_shape = [1, 11, 50265]
self.assertEqual(list(output.shape), expected_shape)
# compare the actual values for a slice.
EXPECTED_SLICE = np.array(
[[[40.4880, 18.0199, -5.2367], [-1.8877, -4.0885, 10.7085], [-2.2613, -5.6110, 7.2665]]], dtype=np.float32
)
self.assertTrue(np.allclose(output[:, :3, :3], EXPECTED_SLICE, atol=1e-4))
@slow
def test_inference_no_head(self):
model = FlaxRobertaPreLayerNormModel.from_pretrained("andreasmadsen/efficient_mlm_m0.40", from_pt=True)
input_ids = np.array([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]], dtype=jnp.int32)
output = model(input_ids)[0]
# compare the actual values for a slice.
EXPECTED_SLICE = np.array(
[[[0.0208, -0.0356, 0.0237], [-0.1569, -0.0411, -0.2626], [0.1879, 0.0125, -0.0089]]], dtype=np.float32
)
self.assertTrue(np.allclose(output[:, :3, :3], EXPECTED_SLICE, atol=1e-4))
......@@ -74,6 +74,7 @@ ROBERTA_EMBEDDING_ADJUSMENT_CONFIGS = [
"LongformerConfig",
"MarkupLMConfig",
"RobertaConfig",
"RobertaPreLayerNormConfig",
"XLMRobertaConfig",
]
......
......@@ -112,6 +112,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"TFDPREncoder", # Building part of bigger (tested) model.
"TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?)
"TFRobertaForMultipleChoice", # TODO: fix
"TFRobertaPreLayerNormForMultipleChoice", # TODO: fix
"TrOCRDecoderWrapper", # Building part of bigger (tested) model.
"TFWhisperEncoder", # Building part of bigger (tested) model.
"TFWhisperDecoder", # Building part of bigger (tested) model.
......
......@@ -146,6 +146,9 @@ src/transformers/models/resnet/modeling_tf_resnet.py
src/transformers/models/roberta/configuration_roberta.py
src/transformers/models/roberta/modeling_roberta.py
src/transformers/models/roberta/modeling_tf_roberta.py
src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py
src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py
src/transformers/models/roc_bert/modeling_roc_bert.py
src/transformers/models/roc_bert/tokenization_roc_bert.py
src/transformers/models/segformer/modeling_segformer.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment