Unverified Commit 30ed3adf authored by novice's avatar novice Committed by GitHub
Browse files

Add Multi Resolution Analysis (MRA) (New PR) (#24513)



* Add all files

* Update masked_language_modeling.md

* fix mlm models

* fix conflicts

* fix conflicts

* fix copies

* Apply suggestions from code review
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Reduce seq_len and hidden_size in ModelTester

* remove output_attentions

* fix conflicts

* remove copied from statements

* Apply suggestions from code review
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------
Co-authored-by: default avatarSylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent abaca9f9
#include <torch/extension.h>
#include <ATen/ATen.h>
#include "cuda_launch.h"
#include <vector>
std::vector<at::Tensor> index_max(
at::Tensor index_vals,
at::Tensor indices,
int A_num_block,
int B_num_block
) {
return index_max_kernel(
index_vals,
indices,
A_num_block,
B_num_block
);
}
at::Tensor mm_to_sparse(
at::Tensor dense_A,
at::Tensor dense_B,
at::Tensor indices
) {
return mm_to_sparse_kernel(
dense_A,
dense_B,
indices
);
}
at::Tensor sparse_dense_mm(
at::Tensor sparse_A,
at::Tensor indices,
at::Tensor dense_B,
int A_num_block
) {
return sparse_dense_mm_kernel(
sparse_A,
indices,
dense_B,
A_num_block
);
}
at::Tensor reduce_sum(
at::Tensor sparse_A,
at::Tensor indices,
int A_num_block,
int B_num_block
) {
return reduce_sum_kernel(
sparse_A,
indices,
A_num_block,
B_num_block
);
}
at::Tensor scatter(
at::Tensor dense_A,
at::Tensor indices,
int B_num_block
) {
return scatter_kernel(
dense_A,
indices,
B_num_block
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("index_max", &index_max, "index_max (CUDA)");
m.def("mm_to_sparse", &mm_to_sparse, "mm_to_sparse (CUDA)");
m.def("sparse_dense_mm", &sparse_dense_mm, "sparse_dense_mm (CUDA)");
m.def("reduce_sum", &reduce_sum, "reduce_sum (CUDA)");
m.def("scatter", &scatter, "scatter (CUDA)");
}
......@@ -134,6 +134,7 @@ from . import (
mobilevit,
mobilevitv2,
mpnet,
mra,
mt5,
musicgen,
mvp,
......
......@@ -137,6 +137,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("mobilevit", "MobileViTConfig"),
("mobilevitv2", "MobileViTV2Config"),
("mpnet", "MPNetConfig"),
("mra", "MraConfig"),
("mt5", "MT5Config"),
("musicgen", "MusicgenConfig"),
("mvp", "MvpConfig"),
......@@ -329,6 +330,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("mobilevit", "MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("mobilevitv2", "MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("mra", "MRA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("musicgen", "MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("mvp", "MVP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("nat", "NAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
......@@ -534,6 +536,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
("mobilevit", "MobileViT"),
("mobilevitv2", "MobileViTV2"),
("mpnet", "MPNet"),
("mra", "MRA"),
("mt5", "MT5"),
("musicgen", "MusicGen"),
("mvp", "MVP"),
......
......@@ -134,6 +134,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("mobilevit", "MobileViTModel"),
("mobilevitv2", "MobileViTV2Model"),
("mpnet", "MPNetModel"),
("mra", "MraModel"),
("mt5", "MT5Model"),
("mvp", "MvpModel"),
("nat", "NatModel"),
......@@ -247,6 +248,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForPreTraining"),
("mobilebert", "MobileBertForPreTraining"),
("mpnet", "MPNetForMaskedLM"),
("mra", "MraForMaskedLM"),
("mvp", "MvpForConditionalGeneration"),
("nezha", "NezhaForPreTraining"),
("nllb-moe", "NllbMoeForConditionalGeneration"),
......@@ -326,6 +328,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForCausalLM"),
("mobilebert", "MobileBertForMaskedLM"),
("mpnet", "MPNetForMaskedLM"),
("mra", "MraForMaskedLM"),
("mvp", "MvpForConditionalGeneration"),
("nezha", "NezhaForMaskedLM"),
("nllb-moe", "NllbMoeForConditionalGeneration"),
......@@ -572,6 +575,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForMaskedLM"),
("mobilebert", "MobileBertForMaskedLM"),
("mpnet", "MPNetForMaskedLM"),
("mra", "MraForMaskedLM"),
("mvp", "MvpForConditionalGeneration"),
("nezha", "NezhaForMaskedLM"),
("nystromformer", "NystromformerForMaskedLM"),
......@@ -704,6 +708,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForSequenceClassification"),
("mobilebert", "MobileBertForSequenceClassification"),
("mpnet", "MPNetForSequenceClassification"),
("mra", "MraForSequenceClassification"),
("mvp", "MvpForSequenceClassification"),
("nezha", "NezhaForSequenceClassification"),
("nystromformer", "NystromformerForSequenceClassification"),
......@@ -771,6 +776,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForQuestionAnswering"),
("mobilebert", "MobileBertForQuestionAnswering"),
("mpnet", "MPNetForQuestionAnswering"),
("mra", "MraForQuestionAnswering"),
("mt5", "MT5ForQuestionAnswering"),
("mvp", "MvpForQuestionAnswering"),
("nezha", "NezhaForQuestionAnswering"),
......@@ -856,6 +862,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForTokenClassification"),
("mobilebert", "MobileBertForTokenClassification"),
("mpnet", "MPNetForTokenClassification"),
("mra", "MraForTokenClassification"),
("nezha", "NezhaForTokenClassification"),
("nystromformer", "NystromformerForTokenClassification"),
("qdqbert", "QDQBertForTokenClassification"),
......@@ -899,6 +906,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
("megatron-bert", "MegatronBertForMultipleChoice"),
("mobilebert", "MobileBertForMultipleChoice"),
("mpnet", "MPNetForMultipleChoice"),
("mra", "MraForMultipleChoice"),
("nezha", "NezhaForMultipleChoice"),
("nystromformer", "NystromformerForMultipleChoice"),
("qdqbert", "QDQBertForMultipleChoice"),
......
......@@ -214,6 +214,7 @@ else:
("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
(
"mt5",
(
......
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
# rely on isort to merge the imports
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
_import_structure = {"configuration_mra": ["MRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MraConfig"]}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_mra"] = [
"MRA_PRETRAINED_MODEL_ARCHIVE_LIST",
"MraForMaskedLM",
"MraForMultipleChoice",
"MraForQuestionAnswering",
"MraForSequenceClassification",
"MraForTokenClassification",
"MraLayer",
"MraModel",
"MraPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_mra import MRA_PRETRAINED_CONFIG_ARCHIVE_MAP, MraConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_mra import (
MRA_PRETRAINED_MODEL_ARCHIVE_LIST,
MraForMaskedLM,
MraForMultipleChoice,
MraForQuestionAnswering,
MraForSequenceClassification,
MraForTokenClassification,
MraLayer,
MraModel,
MraPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MRA model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
MRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"uw-madison/mra-base-512-4": "https://huggingface.co/uw-madison/mra-base-512-4/resolve/main/config.json",
}
class MraConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MraModel`]. It is used to instantiate an MRA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Mra
[uw-madison/mra-base-512-4](https://huggingface.co/uw-madison/mra-base-512-4) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the Mra model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`MraModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 1):
The vocabulary size of the `token_type_ids` passed when calling [`MraModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`.
block_per_row (`int`, *optional*, defaults to 4):
Used to set the budget for the high resolution scale.
approx_mode (`str`, *optional*, defaults to `"full"`):
Controls whether both low and high resolution approximations are used. Set to `"full"` for both low and
high resolution and `"sparse"` for only low resolution.
initial_prior_first_n_blocks (`int`, *optional*, defaults to 0):
The initial number of blocks for which high resolution is used.
initial_prior_diagonal_n_blocks (`int`, *optional*, defaults to 0):
The number of diagonal blocks for which high resolution is used.
Example:
```python
>>> from transformers import MraConfig, MraModel
>>> # Initializing a Mra uw-madison/mra-base-512-4 style configuration
>>> configuration = MraConfig()
>>> # Initializing a model (with random weights) from the uw-madison/mra-base-512-4 style configuration
>>> model = MraModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "mra"
def __init__(
self,
vocab_size=50265,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=1,
initializer_range=0.02,
layer_norm_eps=1e-5,
position_embedding_type="absolute",
block_per_row=4,
approx_mode="full",
initial_prior_first_n_blocks=0,
initial_prior_diagonal_n_blocks=0,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.block_per_row = block_per_row
self.approx_mode = approx_mode
self.initial_prior_first_n_blocks = initial_prior_first_n_blocks
self.initial_prior_diagonal_n_blocks = initial_prior_diagonal_n_blocks
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert MRA checkpoints from the original repository. URL: https://github.com/mlpen/mra-attention"""
import argparse
import torch
from transformers import MraConfig, MraForMaskedLM
def rename_key(orig_key):
if "model" in orig_key:
orig_key = orig_key.replace("model.", "")
if "norm1" in orig_key:
orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
if "norm2" in orig_key:
orig_key = orig_key.replace("norm2", "output.LayerNorm")
if "norm" in orig_key:
orig_key = orig_key.replace("norm", "LayerNorm")
if "transformer" in orig_key:
layer_num = orig_key.split(".")[0].split("_")[-1]
orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
if "mha.attn" in orig_key:
orig_key = orig_key.replace("mha.attn", "attention.self")
if "mha" in orig_key:
orig_key = orig_key.replace("mha", "attention")
if "W_q" in orig_key:
orig_key = orig_key.replace("W_q", "self.query")
if "W_k" in orig_key:
orig_key = orig_key.replace("W_k", "self.key")
if "W_v" in orig_key:
orig_key = orig_key.replace("W_v", "self.value")
if "ff.0" in orig_key:
orig_key = orig_key.replace("ff.0", "intermediate.dense")
if "ff.2" in orig_key:
orig_key = orig_key.replace("ff.2", "output.dense")
if "ff" in orig_key:
orig_key = orig_key.replace("ff", "output.dense")
if "mlm_class" in orig_key:
orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
if "mlm" in orig_key:
orig_key = orig_key.replace("mlm", "cls.predictions.transform")
if "backbone.backbone.encoders" in orig_key:
orig_key = orig_key.replace("backbone.backbone.encoders", "encoder.layer")
if "cls" not in orig_key:
orig_key = "mra." + orig_key
return orig_key
def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
for key in orig_state_dict.copy().keys():
val = orig_state_dict.pop(key)
if ("pooler" in key) or ("sen_class" in key):
continue
else:
orig_state_dict[rename_key(key)] = val
orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
orig_state_dict["mra.embeddings.position_ids"] = torch.arange(max_position_embeddings).expand((1, -1)) + 2
return orig_state_dict
def convert_mra_checkpoint(checkpoint_path, mra_config_file, pytorch_dump_path):
orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
config = MraConfig.from_json_file(mra_config_file)
model = MraForMaskedLM(config)
new_state_dict = convert_checkpoint_helper(config.max_position_embeddings, orig_state_dict)
print(model.load_state_dict(new_state_dict))
model.eval()
model.save_pretrained(pytorch_dump_path)
print(f"Checkpoint successfuly converted. Model saved at {pytorch_dump_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--pytorch_model_path", default=None, type=str, required=True, help="Path to Mra pytorch checkpoint."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The json file for Mra model config.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_mra_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)
This diff is collapsed.
......@@ -4901,6 +4901,58 @@ class MPNetPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"])
MRA_PRETRAINED_MODEL_ARCHIVE_LIST = None
class MraForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MraForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MraForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MraForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MraForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MraModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MraPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MT5EncoderModel(metaclass=DummyObject):
_backends = ["torch"]
......
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch MRA model. """
import unittest
from transformers import MraConfig, is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
if is_torch_available():
import torch
from transformers import (
MraForMaskedLM,
MraForMultipleChoice,
MraForQuestionAnswering,
MraForSequenceClassification,
MraForTokenClassification,
MraModel,
)
from transformers.models.mra.modeling_mra import MRA_PRETRAINED_MODEL_ARCHIVE_LIST
class MraModelTester:
def __init__(
self,
parent,
batch_size=2,
seq_length=8,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=16,
num_hidden_layers=5,
num_attention_heads=2,
intermediate_size=36,
hidden_act="gelu",
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = self.get_config()
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def get_config(self):
return MraConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range,
)
def get_pipeline_config(self):
config = self.get_config()
config.vocab_size = 300
return config
def prepare_config_and_inputs_for_decoder(self):
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return (
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
)
def create_and_check_model(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = MraModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = model(input_ids, token_type_ids=token_type_ids)
result = model(input_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_model_as_decoder(
self,
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
encoder_hidden_states,
encoder_attention_mask,
):
config.add_cross_attention = True
model = MraModel(config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
)
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
encoder_hidden_states=encoder_hidden_states,
)
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def create_and_check_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = MraForMaskedLM(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def create_and_check_for_question_answering(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = MraForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
result = model(
input_ids,
attention_mask=input_mask,
token_type_ids=token_type_ids,
start_positions=sequence_labels,
end_positions=sequence_labels,
)
self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
def create_and_check_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = MraForSequenceClassification(config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_for_token_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_labels = self.num_labels
model = MraForTokenClassification(config=config)
model.to(torch_device)
model.eval()
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
def create_and_check_for_multiple_choice(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.num_choices = self.num_choices
model = MraForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
result = model(
multiple_choice_inputs_ids,
attention_mask=multiple_choice_input_mask,
token_type_ids=multiple_choice_token_type_ids,
labels=choice_labels,
)
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_ids,
token_type_ids,
input_mask,
sequence_labels,
token_labels,
choice_labels,
) = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask}
return config, inputs_dict
@require_torch
class MraModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (
(
MraModel,
MraForMaskedLM,
MraForMultipleChoice,
MraForQuestionAnswering,
MraForSequenceClassification,
MraForTokenClassification,
)
if is_torch_available()
else ()
)
test_pruning = False
test_headmasking = False
test_torchscript = False
has_attentions = False
all_generative_model_classes = ()
def setUp(self):
self.model_tester = MraModelTester(self)
self.config_tester = ConfigTester(self, config_class=MraConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_model_various_embeddings(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
for type in ["absolute", "relative_key", "relative_key_query"]:
config_and_inputs[0].position_embedding_type = type
self.model_tester.create_and_check_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs)
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_token_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in MRA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = MraModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@unittest.skip(reason="MRA does not output attentions")
def test_attention_outputs(self):
return
@require_torch
class MraModelIntegrationTest(unittest.TestCase):
@slow
def test_inference_no_head(self):
model = MraModel.from_pretrained("uw-madison/mra-base-512-4")
input_ids = torch.arange(256).unsqueeze(0)
with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size((1, 256, 768))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[-0.0140, 0.0830, -0.0381], [0.1546, 0.1402, 0.0220], [0.1162, 0.0851, 0.0165]]]
)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_inference_masked_lm(self):
model = MraForMaskedLM.from_pretrained("uw-madison/mra-base-512-4")
input_ids = torch.arange(256).unsqueeze(0)
with torch.no_grad():
output = model(input_ids)[0]
vocab_size = 50265
expected_shape = torch.Size((1, 256, vocab_size))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[9.2595, -3.6038, 11.8819], [9.3869, -3.2693, 11.0956], [11.8524, -3.4938, 13.1210]]]
)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_inference_masked_lm_long_input(self):
model = MraForMaskedLM.from_pretrained("uw-madison/mra-base-4096-8-d3")
input_ids = torch.arange(4096).unsqueeze(0)
with torch.no_grad():
output = model(input_ids)[0]
vocab_size = 50265
expected_shape = torch.Size((1, 4096, vocab_size))
self.assertEqual(output.shape, expected_shape)
expected_slice = torch.tensor(
[[[5.4789, -2.3564, 7.5064], [7.9067, -1.3369, 9.9668], [9.0712, -1.8106, 7.0380]]]
)
self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment