Add Multi Resolution Analysis (MRA) (New PR) (#24513)

* Add all files * Update masked_language_modeling.md * fix mlm models * fix conflicts * fix conflicts * fix copies * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Reduce seq_len and hidden_size in ModelTester * remove output_attentions * fix conflicts * remove copied from statements * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

Add Multi Resolution Analysis (MRA) (New PR) (#24513)
* Add all files * Update masked_language_modeling.md * fix mlm models * fix conflicts * fix conflicts * fix copies * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Reduce seq_len and hidden_size in ModelTester * remove output_attentions * fix conflicts * remove copied from statements * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
30ed3adf · novice · GitHub · abaca9f9 · 30ed3adf · 30ed3adf
Unverified Commit 30ed3adf authored Jul 10, 2023 by novice Committed by GitHub Jul 10, 2023
12 changed files
--- a/src/transformers/kernels/mra/torch_extension.cpp
+++ b/src/transformers/kernels/mra/torch_extension.cpp
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "cuda_launch.h"
+#include <vector>
+
+std::vector<at::Tensor> index_max(
+  at::Tensor index_vals,
+  at::Tensor indices,
+  int A_num_block,
+  int B_num_block
+) {
+  return index_max_kernel(
+    index_vals,
+    indices,
+    A_num_block,
+    B_num_block
+  );
+}
+
+at::Tensor mm_to_sparse(
+  at::Tensor dense_A,
+  at::Tensor dense_B,
+  at::Tensor indices
+) {
+  return mm_to_sparse_kernel(
+    dense_A,
+    dense_B,
+    indices
+  );
+}
+
+at::Tensor sparse_dense_mm(
+  at::Tensor sparse_A,
+  at::Tensor indices,
+  at::Tensor dense_B,
+  int A_num_block
+) {
+  return sparse_dense_mm_kernel(
+    sparse_A,
+    indices,
+    dense_B,
+    A_num_block
+  );
+}
+
+at::Tensor reduce_sum(
+  at::Tensor sparse_A,
+  at::Tensor indices,
+  int A_num_block,
+  int B_num_block
+) {
+  return reduce_sum_kernel(
+    sparse_A,
+    indices,
+    A_num_block,
+    B_num_block
+  );
+}
+
+at::Tensor scatter(
+  at::Tensor dense_A,
+  at::Tensor indices,
+  int B_num_block
+) {
+  return scatter_kernel(
+    dense_A,
+    indices,
+    B_num_block
+  );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("index_max", &index_max, "index_max (CUDA)");
+  m.def("mm_to_sparse", &mm_to_sparse, "mm_to_sparse (CUDA)");
+  m.def("sparse_dense_mm", &sparse_dense_mm, "sparse_dense_mm (CUDA)");
+  m.def("reduce_sum", &reduce_sum, "reduce_sum (CUDA)");
+  m.def("scatter", &scatter, "scatter (CUDA)");
+}
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -134,6 +134,7 @@ from . import (
    mobilevit,
    mobilevitv2,
    mpnet,
+    mra,
    mt5,
    musicgen,
    mvp,

--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -137,6 +137,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
        ("mobilevit", "MobileViTConfig"),
        ("mobilevitv2", "MobileViTV2Config"),
        ("mpnet", "MPNetConfig"),
+        ("mra", "MraConfig"),
        ("mt5", "MT5Config"),
        ("musicgen", "MusicgenConfig"),
        ("mvp", "MvpConfig"),
@@ -329,6 +330,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
        ("mobilevit", "MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("mobilevitv2", "MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("mra", "MRA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("musicgen", "MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("mvp", "MVP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
        ("nat", "NAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -534,6 +536,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("mobilevit", "MobileViT"),
        ("mobilevitv2", "MobileViTV2"),
        ("mpnet", "MPNet"),
+        ("mra", "MRA"),
        ("mt5", "MT5"),
        ("musicgen", "MusicGen"),
        ("mvp", "MVP"),

--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -134,6 +134,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
        ("mobilevit", "MobileViTModel"),
        ("mobilevitv2", "MobileViTV2Model"),
        ("mpnet", "MPNetModel"),
+        ("mra", "MraModel"),
        ("mt5", "MT5Model"),
        ("mvp", "MvpModel"),
        ("nat", "NatModel"),
@@ -247,6 +248,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForPreTraining"),
        ("mobilebert", "MobileBertForPreTraining"),
        ("mpnet", "MPNetForMaskedLM"),
+        ("mra", "MraForMaskedLM"),
        ("mvp", "MvpForConditionalGeneration"),
        ("nezha", "NezhaForPreTraining"),
        ("nllb-moe", "NllbMoeForConditionalGeneration"),
@@ -326,6 +328,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForCausalLM"),
        ("mobilebert", "MobileBertForMaskedLM"),
        ("mpnet", "MPNetForMaskedLM"),
+        ("mra", "MraForMaskedLM"),
        ("mvp", "MvpForConditionalGeneration"),
        ("nezha", "NezhaForMaskedLM"),
        ("nllb-moe", "NllbMoeForConditionalGeneration"),
@@ -572,6 +575,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForMaskedLM"),
        ("mobilebert", "MobileBertForMaskedLM"),
        ("mpnet", "MPNetForMaskedLM"),
+        ("mra", "MraForMaskedLM"),
        ("mvp", "MvpForConditionalGeneration"),
        ("nezha", "NezhaForMaskedLM"),
        ("nystromformer", "NystromformerForMaskedLM"),
@@ -704,6 +708,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForSequenceClassification"),
        ("mobilebert", "MobileBertForSequenceClassification"),
        ("mpnet", "MPNetForSequenceClassification"),
+        ("mra", "MraForSequenceClassification"),
        ("mvp", "MvpForSequenceClassification"),
        ("nezha", "NezhaForSequenceClassification"),
        ("nystromformer", "NystromformerForSequenceClassification"),
@@ -771,6 +776,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForQuestionAnswering"),
        ("mobilebert", "MobileBertForQuestionAnswering"),
        ("mpnet", "MPNetForQuestionAnswering"),
+        ("mra", "MraForQuestionAnswering"),
        ("mt5", "MT5ForQuestionAnswering"),
        ("mvp", "MvpForQuestionAnswering"),
        ("nezha", "NezhaForQuestionAnswering"),
@@ -856,6 +862,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForTokenClassification"),
        ("mobilebert", "MobileBertForTokenClassification"),
        ("mpnet", "MPNetForTokenClassification"),
+        ("mra", "MraForTokenClassification"),
        ("nezha", "NezhaForTokenClassification"),
        ("nystromformer", "NystromformerForTokenClassification"),
        ("qdqbert", "QDQBertForTokenClassification"),
@@ -899,6 +906,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
        ("megatron-bert", "MegatronBertForMultipleChoice"),
        ("mobilebert", "MobileBertForMultipleChoice"),
        ("mpnet", "MPNetForMultipleChoice"),
+        ("mra", "MraForMultipleChoice"),
        ("nezha", "NezhaForMultipleChoice"),
        ("nystromformer", "NystromformerForMultipleChoice"),
        ("qdqbert", "QDQBertForMultipleChoice"),

--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -214,6 +214,7 @@ else:
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
+            ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            (
                "mt5",
                (

--- a/src/transformers/models/mra/__init__.py
+++ b/src/transformers/models/mra/__init__.py
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+
+
+_import_structure = {"configuration_mra": ["MRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MraConfig"]}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_mra"] = [
+        "MRA_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "MraForMaskedLM",
+        "MraForMultipleChoice",
+        "MraForQuestionAnswering",
+        "MraForSequenceClassification",
+        "MraForTokenClassification",
+        "MraLayer",
+        "MraModel",
+        "MraPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_mra import MRA_PRETRAINED_CONFIG_ARCHIVE_MAP, MraConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_mra import (
+            MRA_PRETRAINED_MODEL_ARCHIVE_LIST,
+            MraForMaskedLM,
+            MraForMultipleChoice,
+            MraForQuestionAnswering,
+            MraForSequenceClassification,
+            MraForTokenClassification,
+            MraLayer,
+            MraModel,
+            MraPreTrainedModel,
+        )
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
--- a/src/transformers/models/mra/configuration_mra.py
+++ b/src/transformers/models/mra/configuration_mra.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" MRA model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+MRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "uw-madison/mra-base-512-4": "https://huggingface.co/uw-madison/mra-base-512-4/resolve/main/config.json",
+}
+
+
+class MraConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MraModel`]. It is used to instantiate an MRA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Mra
+    [uw-madison/mra-base-512-4](https://huggingface.co/uw-madison/mra-base-512-4) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the Mra model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MraModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 1):
+            The vocabulary size of the `token_type_ids` passed when calling [`MraModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`.
+        block_per_row (`int`, *optional*, defaults to 4):
+            Used to set the budget for the high resolution scale.
+        approx_mode (`str`, *optional*, defaults to `"full"`):
+            Controls whether both low and high resolution approximations are used. Set to `"full"` for both low and
+            high resolution and `"sparse"` for only low resolution.
+        initial_prior_first_n_blocks (`int`, *optional*, defaults to 0):
+            The initial number of blocks for which high resolution is used.
+        initial_prior_diagonal_n_blocks (`int`, *optional*, defaults to 0):
+            The number of diagonal blocks for which high resolution is used.
+
+    Example:
+
+    ```python
+    >>> from transformers import MraConfig, MraModel
+
+    >>> # Initializing a Mra uw-madison/mra-base-512-4 style configuration
+    >>> configuration = MraConfig()
+
+    >>> # Initializing a model (with random weights) from the uw-madison/mra-base-512-4 style configuration
+    >>> model = MraModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "mra"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        position_embedding_type="absolute",
+        block_per_row=4,
+        approx_mode="full",
+        initial_prior_first_n_blocks=0,
+        initial_prior_diagonal_n_blocks=0,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.block_per_row = block_per_row
+        self.approx_mode = approx_mode
+        self.initial_prior_first_n_blocks = initial_prior_first_n_blocks
+        self.initial_prior_diagonal_n_blocks = initial_prior_diagonal_n_blocks
--- a/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
+++ b/src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert MRA checkpoints from the original repository. URL: https://github.com/mlpen/mra-attention"""
+
+import argparse
+
+import torch
+
+from transformers import MraConfig, MraForMaskedLM
+
+
+def rename_key(orig_key):
+    if "model" in orig_key:
+        orig_key = orig_key.replace("model.", "")
+    if "norm1" in orig_key:
+        orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
+    if "norm2" in orig_key:
+        orig_key = orig_key.replace("norm2", "output.LayerNorm")
+    if "norm" in orig_key:
+        orig_key = orig_key.replace("norm", "LayerNorm")
+    if "transformer" in orig_key:
+        layer_num = orig_key.split(".")[0].split("_")[-1]
+        orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
+    if "mha.attn" in orig_key:
+        orig_key = orig_key.replace("mha.attn", "attention.self")
+    if "mha" in orig_key:
+        orig_key = orig_key.replace("mha", "attention")
+    if "W_q" in orig_key:
+        orig_key = orig_key.replace("W_q", "self.query")
+    if "W_k" in orig_key:
+        orig_key = orig_key.replace("W_k", "self.key")
+    if "W_v" in orig_key:
+        orig_key = orig_key.replace("W_v", "self.value")
+    if "ff.0" in orig_key:
+        orig_key = orig_key.replace("ff.0", "intermediate.dense")
+    if "ff.2" in orig_key:
+        orig_key = orig_key.replace("ff.2", "output.dense")
+    if "ff" in orig_key:
+        orig_key = orig_key.replace("ff", "output.dense")
+    if "mlm_class" in orig_key:
+        orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
+    if "mlm" in orig_key:
+        orig_key = orig_key.replace("mlm", "cls.predictions.transform")
+    if "backbone.backbone.encoders" in orig_key:
+        orig_key = orig_key.replace("backbone.backbone.encoders", "encoder.layer")
+    if "cls" not in orig_key:
+        orig_key = "mra." + orig_key
+
+    return orig_key
+
+
+def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
+    for key in orig_state_dict.copy().keys():
+        val = orig_state_dict.pop(key)
+
+        if ("pooler" in key) or ("sen_class" in key):
+            continue
+        else:
+            orig_state_dict[rename_key(key)] = val
+
+    orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
+    orig_state_dict["mra.embeddings.position_ids"] = torch.arange(max_position_embeddings).expand((1, -1)) + 2
+
+    return orig_state_dict
+
+
+def convert_mra_checkpoint(checkpoint_path, mra_config_file, pytorch_dump_path):
+    orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
+    config = MraConfig.from_json_file(mra_config_file)
+    model = MraForMaskedLM(config)
+
+    new_state_dict = convert_checkpoint_helper(config.max_position_embeddings, orig_state_dict)
+
+    print(model.load_state_dict(new_state_dict))
+    model.eval()
+    model.save_pretrained(pytorch_dump_path)
+
+    print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--pytorch_model_path", default=None, type=str, required=True, help="Path to Mra pytorch checkpoint."
+    )
+    parser.add_argument(
+        "--config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The json file for Mra model config.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    args = parser.parse_args()
+    convert_mra_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
--- a/src/transformers/models/mra/modeling_mra.py
+++ b/src/transformers/models/mra/modeling_mra.py
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4901,6 +4901,58 @@ class MPNetPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])


+MRA_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class MraForMaskedLM(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MraForMultipleChoice(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MraForQuestionAnswering(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MraForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MraForTokenClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MraModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MraPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class MT5EncoderModel(metaclass=DummyObject):
    _backends = ["torch"]


--- a/tests/models/mra/__init__.py
+++ b/tests/models/mra/__init__.py
--- a/tests/models/mra/test_modeling_mra.py
+++ b/tests/models/mra/test_modeling_mra.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch MRA model. """
+
+
+import unittest
+
+from transformers import MraConfig, is_torch_available
+from transformers.testing_utils import require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        MraForMaskedLM,
+        MraForMultipleChoice,
+        MraForQuestionAnswering,
+        MraForSequenceClassification,
+        MraForTokenClassification,
+        MraModel,
+    )
+    from transformers.models.mra.modeling_mra import MRA_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+class MraModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=8,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=5,
+        num_attention_heads=2,
+        intermediate_size=36,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def get_config(self):
+        return MraConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
+    def prepare_config_and_inputs_for_decoder(self):
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = self.prepare_config_and_inputs()
+
+        config.is_decoder = True
+        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
+        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        return (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+            encoder_hidden_states,
+            encoder_attention_mask,
+        )
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = MraModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        result = model(input_ids, token_type_ids=token_type_ids)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_model_as_decoder(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.add_cross_attention = True
+        model = MraModel(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+        )
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            encoder_hidden_states=encoder_hidden_states,
+        )
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def create_and_check_for_masked_lm(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = MraForMaskedLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    def create_and_check_for_question_answering(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = MraForQuestionAnswering(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            token_type_ids=token_type_ids,
+            start_positions=sequence_labels,
+            end_positions=sequence_labels,
+        )
+        self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
+        self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
+
+    def create_and_check_for_sequence_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = MraForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+    def create_and_check_for_token_classification(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_labels = self.num_labels
+        model = MraForTokenClassification(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
+
+    def create_and_check_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = MraForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        result = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class MraModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            MraModel,
+            MraForMaskedLM,
+            MraForMultipleChoice,
+            MraForQuestionAnswering,
+            MraForSequenceClassification,
+            MraForTokenClassification,
+        )
+        if is_torch_available()
+        else ()
+    )
+    test_pruning = False
+    test_headmasking = False
+    test_torchscript = False
+    has_attentions = False
+
+    all_generative_model_classes = ()
+
+    def setUp(self):
+        self.model_tester = MraModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=MraConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_model_various_embeddings(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        for type in ["absolute", "relative_key", "relative_key_query"]:
+            config_and_inputs[0].position_embedding_type = type
+            self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
+
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in MRA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = MraModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+    @unittest.skip(reason="MRA does not output attentions")
+    def test_attention_outputs(self):
+        return
+
+
+@require_torch
+class MraModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference_no_head(self):
+        model = MraModel.from_pretrained("uw-madison/mra-base-512-4")
+        input_ids = torch.arange(256).unsqueeze(0)
+
+        with torch.no_grad():
+            output = model(input_ids)[0]
+
+        expected_shape = torch.Size((1, 256, 768))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[[-0.0140, 0.0830, -0.0381], [0.1546, 0.1402, 0.0220], [0.1162, 0.0851, 0.0165]]]
+        )
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_inference_masked_lm(self):
+        model = MraForMaskedLM.from_pretrained("uw-madison/mra-base-512-4")
+        input_ids = torch.arange(256).unsqueeze(0)
+
+        with torch.no_grad():
+            output = model(input_ids)[0]
+
+        vocab_size = 50265
+
+        expected_shape = torch.Size((1, 256, vocab_size))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[[9.2595, -3.6038, 11.8819], [9.3869, -3.2693, 11.0956], [11.8524, -3.4938, 13.1210]]]
+        )
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
+
+    @slow
+    def test_inference_masked_lm_long_input(self):
+        model = MraForMaskedLM.from_pretrained("uw-madison/mra-base-4096-8-d3")
+        input_ids = torch.arange(4096).unsqueeze(0)
+
+        with torch.no_grad():
+            output = model(input_ids)[0]
+
+        vocab_size = 50265
+
+        expected_shape = torch.Size((1, 4096, vocab_size))
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[[5.4789, -2.3564, 7.5064], [7.9067, -1.3369, 9.9668], [9.0712, -1.8106, 7.0380]]]
+        )
+
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))