Unverified commit b0d539cc authored by Jannis Vamvas, committed by GitHub

Add X-MOD (#20939)



* Add X-MOD to Readme

* Add documentation for X-MOD

* Implement X-MOD

* Fix formatting of X-MOD docs

* Change signature of X-MOD forward methods to use lang_ids

* Minor changes

* Rebase with main and run make fix-copies

* Make suggested changes to docstrings

* Improve code readability
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* Fix code style

* Conversion script: Remove asserts and type annotations

* Remove _TOKENIZER_FOR_DOC

* XMOD -> Xmod

* Update copyright note

* Fix doctests

* Fix docstring

* Add integration test for FillMaskPipeline

* Revert "Add integration test for FillMaskPipeline"

This reverts commit 4381eb3b1d0f5d85785f89caba83928e6efa6d1f.

* Add end-to-end integration test for mask fill

* make style

* Rebase with main and make fix-copies

---------
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
parent adb2503e
......@@ -184,6 +184,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaModel"),
("xlm-roberta-xl", "XLMRobertaXLModel"),
("xlnet", "XLNetModel"),
("xmod", "XmodModel"),
("yolos", "YolosModel"),
("yoso", "YosoModel"),
]
......@@ -244,6 +245,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForMaskedLM"),
("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"),
("xlnet", "XLNetLMHeadModel"),
("xmod", "XmodForMaskedLM"),
]
)
......@@ -317,6 +319,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForMaskedLM"),
("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"),
("xlnet", "XLNetLMHeadModel"),
("xmod", "XmodForMaskedLM"),
("yoso", "YosoForMaskedLM"),
]
)
......@@ -371,6 +374,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForCausalLM"),
("xlm-roberta-xl", "XLMRobertaXLForCausalLM"),
("xlnet", "XLNetLMHeadModel"),
("xmod", "XmodForCausalLM"),
]
)
......@@ -531,6 +535,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
("xlm", "XLMWithLMHeadModel"),
("xlm-roberta", "XLMRobertaForMaskedLM"),
("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"),
("xmod", "XmodForMaskedLM"),
("yoso", "YosoForMaskedLM"),
]
)
......@@ -658,6 +663,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForSequenceClassification"),
("xlm-roberta-xl", "XLMRobertaXLForSequenceClassification"),
("xlnet", "XLNetForSequenceClassification"),
("xmod", "XmodForSequenceClassification"),
("yoso", "YosoForSequenceClassification"),
]
)
......@@ -714,6 +720,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForQuestionAnswering"),
("xlm-roberta-xl", "XLMRobertaXLForQuestionAnswering"),
("xlnet", "XLNetForQuestionAnsweringSimple"),
("xmod", "XmodForQuestionAnswering"),
("yoso", "YosoForQuestionAnswering"),
]
)
......@@ -785,6 +792,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForTokenClassification"),
("xlm-roberta-xl", "XLMRobertaXLForTokenClassification"),
("xlnet", "XLNetForTokenClassification"),
("xmod", "XmodForTokenClassification"),
("yoso", "YosoForTokenClassification"),
]
)
......@@ -825,6 +833,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict(
("xlm-roberta", "XLMRobertaForMultipleChoice"),
("xlm-roberta-xl", "XLMRobertaXLForMultipleChoice"),
("xlnet", "XLNetForMultipleChoice"),
("xmod", "XmodForMultipleChoice"),
("yoso", "YosoForMultipleChoice"),
]
)
......
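Taken together, the mapping entries above register `xmod` with the Auto classes. A minimal usage sketch (assuming PyTorch is installed and the `jvamvas/xmod-base` checkpoint referenced later in this diff is reachable):

```python
from transformers import AutoModelForMaskedLM

# "xmod" now resolves to the Xmod classes through the mappings above.
model = AutoModelForMaskedLM.from_pretrained("jvamvas/xmod-base")
print(type(model).__name__)  # XmodForMaskedLM
```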
......@@ -325,6 +325,13 @@ else:
"XLNetTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"xmod",
(
"XLMRobertaTokenizer" if is_sentencepiece_available() else None,
"XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
),
),
(
"yoso",
(
......
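Since `xmod` is mapped to the XLM-R tokenizer classes rather than a new tokenizer, `AutoTokenizer` returns an `XLMRobertaTokenizer`/`XLMRobertaTokenizerFast` for X-MOD checkpoints. A sketch, assuming `sentencepiece` or `tokenizers` (and PyTorch, for the tensor output) is available:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("jvamvas/xmod-base")
print(type(tokenizer).__name__)  # XLMRobertaTokenizerFast (or XLMRobertaTokenizer)

inputs = tokenizer("Hello, World!", return_tensors="pt")
```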
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
_import_structure = {
"configuration_xmod": [
"XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP",
"XmodConfig",
"XmodOnnxConfig",
],
}
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_xmod"] = [
"XMOD_PRETRAINED_MODEL_ARCHIVE_LIST",
"XmodForCausalLM",
"XmodForMaskedLM",
"XmodForMultipleChoice",
"XmodForQuestionAnswering",
"XmodForSequenceClassification",
"XmodForTokenClassification",
"XmodModel",
"XmodPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_xmod import XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP, XmodConfig, XmodOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_xmod import (
XMOD_PRETRAINED_MODEL_ARCHIVE_LIST,
XmodForCausalLM,
XmodForMaskedLM,
XmodForMultipleChoice,
XmodForQuestionAnswering,
XmodForSequenceClassification,
XmodForTokenClassification,
XmodModel,
XmodPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
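With the lazy module above, `transformers.models.xmod` resolves its public names on first access: the configuration imports without PyTorch, while the modeling classes require it. A small sketch of the resulting import surface:

```python
from transformers.models.xmod import XmodConfig  # importable without torch

config = XmodConfig()
print(config.model_type)  # "xmod"

# Torch-dependent classes are exposed lazily through the same module:
# from transformers.models.xmod import XmodModel
```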
# coding=utf-8
# Copyright 2023 The Meta AI Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" X-MOD configuration"""
from collections import OrderedDict
from typing import Mapping
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging
logger = logging.get_logger(__name__)
XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"jvamvas/xmod-base": "https://huggingface.co/jvamvas/xmod-base/resolve/main/config.json",
"jvamvas/xmod-large-prenorm": "https://huggingface.co/jvamvas/xmod-large-prenorm/resolve/main/config.json",
"jvamvas/xmod-base-13-125k": "https://huggingface.co/jvamvas/xmod-base-13-125k/resolve/main/config.json",
"jvamvas/xmod-base-30-125k": "https://huggingface.co/jvamvas/xmod-base-30-125k/resolve/main/config.json",
"jvamvas/xmod-base-30-195k": "https://huggingface.co/jvamvas/xmod-base-30-195k/resolve/main/config.json",
"jvamvas/xmod-base-60-125k": "https://huggingface.co/jvamvas/xmod-base-60-125k/resolve/main/config.json",
"jvamvas/xmod-base-60-265k": "https://huggingface.co/jvamvas/xmod-base-60-265k/resolve/main/config.json",
"jvamvas/xmod-base-75-125k": "https://huggingface.co/jvamvas/xmod-base-75-125k/resolve/main/config.json",
"jvamvas/xmod-base-75-269k": "https://huggingface.co/jvamvas/xmod-base-75-269k/resolve/main/config.json",
}
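Any of the checkpoints in the archive map above can be loaded by name; a sketch, assuming the repository is reachable under that name:

```python
from transformers import XmodConfig

config = XmodConfig.from_pretrained("jvamvas/xmod-base-30-125k")
print(config.num_hidden_layers, len(config.languages))
```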
class XmodConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`XmodModel`]. It is used to instantiate an X-MOD
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [xmod-base](https://huggingface.co/jvamvas/xmod-base)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 30522):
Vocabulary size of the X-MOD model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`XmodModel`].
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (`int`, *optional*, defaults to 2):
The vocabulary size of the `token_type_ids` passed when calling [`XmodModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
is_decoder (`bool`, *optional*, defaults to `False`):
Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
classifier_dropout (`float`, *optional*):
The dropout ratio for the classification head.
pre_norm (`bool`, *optional*, defaults to `False`):
Whether to apply layer normalization before each block.
adapter_reduction_factor (`int` or `float`, *optional*, defaults to 2):
The factor by which the dimensionality of the adapter is reduced relative to `hidden_size`.
adapter_layer_norm (`bool`, *optional*, defaults to `False`):
Whether to apply a new layer normalization before the adapter modules (shared across all adapters).
adapter_reuse_layer_norm (`bool`, *optional*, defaults to `True`):
Whether to reuse the second layer normalization and apply it before the adapter modules as well.
ln_before_adapter (`bool`, *optional*, defaults to `True`):
Whether to apply the layer normalization before the residual connection around the adapter module.
languages (`Iterable[str]`, *optional*, defaults to `["en_XX"]`):
An iterable of language codes for which adapter modules should be initialized.
default_language (`str`, *optional*):
Language code of a default language. It will be assumed that the input is in this language if no language
codes are explicitly passed to the forward method.
Examples:
```python
>>> from transformers import XmodConfig, XmodModel
>>> # Initializing an X-MOD jvamvas/xmod-base style configuration
>>> configuration = XmodConfig()
>>> # Initializing a model (with random weights) from the jvamvas/xmod-base style configuration
>>> model = XmodModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "xmod"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
pre_norm=False,
adapter_reduction_factor=2,
adapter_layer_norm=False,
adapter_reuse_layer_norm=True,
ln_before_adapter=True,
languages=("en_XX",),
default_language=None,
**kwargs,
):
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
self.pre_norm = pre_norm
self.adapter_reduction_factor = adapter_reduction_factor
self.adapter_layer_norm = adapter_layer_norm
self.adapter_reuse_layer_norm = adapter_reuse_layer_norm
self.ln_before_adapter = ln_before_adapter
self.languages = list(languages)
self.default_language = default_language
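Beyond the basic docstring example, the adapter-specific arguments can be set directly when building a configuration; a sketch with illustrative language codes:

```python
from transformers import XmodConfig

# Illustrative language codes; one adapter module is created per entry.
config = XmodConfig(
    languages=["en_XX", "de_DE", "fr_XX"],
    default_language="en_XX",    # assumed when no language ids are passed to forward()
    adapter_reduction_factor=2,  # adapter bottleneck dimension = hidden_size / 2
    pre_norm=False,
)
print(config.languages)  # ['en_XX', 'de_DE', 'fr_XX']
```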
# Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->Xmod
class XmodOnnxConfig(OnnxConfig):
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
if self.task == "multiple-choice":
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),
("attention_mask", dynamic_axis),
]
)
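The ONNX config only declares dynamic axes for the two tensor inputs; a quick inspection sketch (the constructor signature is inherited from `OnnxConfig`):

```python
from transformers import XmodConfig
from transformers.models.xmod import XmodOnnxConfig

onnx_config = XmodOnnxConfig(XmodConfig(), task="default")
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```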
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert X-MOD checkpoint."""
import argparse
from pathlib import Path
import fairseq
import torch
from fairseq.models.xmod import XMODModel as FairseqXmodModel
from packaging import version
from transformers import XmodConfig, XmodForMaskedLM, XmodForSequenceClassification
from transformers.utils import logging
if version.parse(fairseq.__version__) < version.parse("0.12.2"):
raise Exception("requires fairseq >= 0.12.2")
if version.parse(fairseq.__version__) > version.parse("2"):
raise Exception("requires fairseq < v2")
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
SAMPLE_TEXT = "Hello, World!"
SAMPLE_LANGUAGE = "en_XX"
def convert_xmod_checkpoint_to_pytorch(
xmod_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
):
data_dir = Path("data_bin")
xmod = FairseqXmodModel.from_pretrained(
model_name_or_path=str(Path(xmod_checkpoint_path).parent),
checkpoint_file=Path(xmod_checkpoint_path).name,
_name="xmod_base",
arch="xmod_base",
task="multilingual_masked_lm",
data_name_or_path=str(data_dir),
bpe="sentencepiece",
sentencepiece_model=str(Path(xmod_checkpoint_path).parent / "sentencepiece.bpe.model"),
src_dict=str(data_dir / "dict.txt"),
)
xmod.eval() # disable dropout
print(xmod)
xmod_sent_encoder = xmod.model.encoder.sentence_encoder
config = XmodConfig(
vocab_size=xmod_sent_encoder.embed_tokens.num_embeddings,
hidden_size=xmod.cfg.model.encoder_embed_dim,
num_hidden_layers=xmod.cfg.model.encoder_layers,
num_attention_heads=xmod.cfg.model.encoder_attention_heads,
intermediate_size=xmod.cfg.model.encoder_ffn_embed_dim,
max_position_embeddings=514,
type_vocab_size=1,
layer_norm_eps=1e-5, # PyTorch default used in fairseq
pre_norm=xmod.cfg.model.encoder_normalize_before,
adapter_reduction_factor=getattr(xmod.cfg.model, "bottleneck", 2),
adapter_layer_norm=xmod.cfg.model.adapter_layer_norm,
adapter_reuse_layer_norm=xmod.cfg.model.adapter_reuse_layer_norm,
ln_before_adapter=xmod.cfg.model.ln_before_adapter,
languages=xmod.cfg.model.languages,
)
if classification_head:
config.num_labels = xmod.model.classification_heads["mnli"].out_proj.weight.shape[0]
print("Our X-MOD config:", config)
model = XmodForSequenceClassification(config) if classification_head else XmodForMaskedLM(config)
model.eval()
# Now let's copy all the weights.
# Embeddings
model.roberta.embeddings.word_embeddings.weight = xmod_sent_encoder.embed_tokens.weight
model.roberta.embeddings.position_embeddings.weight = xmod_sent_encoder.embed_positions.weight
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
model.roberta.embeddings.token_type_embeddings.weight
) # just zero them out because X-MOD doesn't use them.
model.roberta.embeddings.LayerNorm.weight = xmod_sent_encoder.layernorm_embedding.weight
model.roberta.embeddings.LayerNorm.bias = xmod_sent_encoder.layernorm_embedding.bias
for i in range(config.num_hidden_layers):
# Encoder: start of layer
layer = model.roberta.encoder.layer[i]
xmod_layer = xmod_sent_encoder.layers[i]
# self attention
self_attn = layer.attention.self
if not (
xmod_layer.self_attn.k_proj.weight.data.shape
== xmod_layer.self_attn.q_proj.weight.data.shape
== xmod_layer.self_attn.v_proj.weight.data.shape
== torch.Size((config.hidden_size, config.hidden_size))
):
raise AssertionError("Dimensions of self-attention weights do not match.")
self_attn.query.weight.data = xmod_layer.self_attn.q_proj.weight
self_attn.query.bias.data = xmod_layer.self_attn.q_proj.bias
self_attn.key.weight.data = xmod_layer.self_attn.k_proj.weight
self_attn.key.bias.data = xmod_layer.self_attn.k_proj.bias
self_attn.value.weight.data = xmod_layer.self_attn.v_proj.weight
self_attn.value.bias.data = xmod_layer.self_attn.v_proj.bias
# self-attention output
self_output = layer.attention.output
if self_output.dense.weight.shape != xmod_layer.self_attn.out_proj.weight.shape:
raise AssertionError("Dimensions of self-attention output weights do not match.")
self_output.dense.weight = xmod_layer.self_attn.out_proj.weight
self_output.dense.bias = xmod_layer.self_attn.out_proj.bias
self_output.LayerNorm.weight = xmod_layer.self_attn_layer_norm.weight
self_output.LayerNorm.bias = xmod_layer.self_attn_layer_norm.bias
# intermediate
intermediate = layer.intermediate
if intermediate.dense.weight.shape != xmod_layer.fc1.weight.shape:
raise AssertionError("Dimensions of intermediate weights do not match.")
intermediate.dense.weight = xmod_layer.fc1.weight
intermediate.dense.bias = xmod_layer.fc1.bias
# output
bert_output = layer.output
if bert_output.dense.weight.shape != xmod_layer.fc2.weight.shape:
raise AssertionError("Dimensions of feed-forward weights do not match.")
bert_output.dense.weight = xmod_layer.fc2.weight
bert_output.dense.bias = xmod_layer.fc2.bias
bert_output.LayerNorm.weight = xmod_layer.final_layer_norm.weight
bert_output.LayerNorm.bias = xmod_layer.final_layer_norm.bias
if bert_output.adapter_layer_norm is not None:
bert_output.adapter_layer_norm.weight = xmod_layer.adapter_layer_norm.weight
bert_output.adapter_layer_norm.bias = xmod_layer.adapter_layer_norm.bias
if sorted(bert_output.adapter_modules.keys()) != sorted(xmod_layer.adapter_modules.keys()):
raise AssertionError("Lists of language adapters do not match.")
for lang_code, adapter in xmod_layer.adapter_modules.items():
to_adapter = bert_output.adapter_modules[lang_code]
from_adapter = xmod_layer.adapter_modules[lang_code]
to_adapter.dense1.weight = from_adapter.fc1.weight
to_adapter.dense1.bias = from_adapter.fc1.bias
to_adapter.dense2.weight = from_adapter.fc2.weight
to_adapter.dense2.bias = from_adapter.fc2.bias
# end of layer
if xmod_sent_encoder.layer_norm is not None:
model.roberta.encoder.LayerNorm.weight = xmod_sent_encoder.layer_norm.weight
model.roberta.encoder.LayerNorm.bias = xmod_sent_encoder.layer_norm.bias
if classification_head:
model.classifier.dense.weight = xmod.model.classification_heads["mnli"].dense.weight
model.classifier.dense.bias = xmod.model.classification_heads["mnli"].dense.bias
model.classifier.out_proj.weight = xmod.model.classification_heads["mnli"].out_proj.weight
model.classifier.out_proj.bias = xmod.model.classification_heads["mnli"].out_proj.bias
else:
# LM Head
model.lm_head.dense.weight = xmod.model.encoder.lm_head.dense.weight
model.lm_head.dense.bias = xmod.model.encoder.lm_head.dense.bias
model.lm_head.layer_norm.weight = xmod.model.encoder.lm_head.layer_norm.weight
model.lm_head.layer_norm.bias = xmod.model.encoder.lm_head.layer_norm.bias
model.lm_head.decoder.weight = xmod.model.encoder.lm_head.weight
model.lm_head.decoder.bias = xmod.model.encoder.lm_head.bias
# Let's check that we get the same results.
input_ids = xmod.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
model.roberta.set_default_language(SAMPLE_LANGUAGE)
our_output = model(input_ids)[0]
if classification_head:
their_output = xmod.model.classification_heads["mnli"](xmod.extract_features(input_ids))
else:
their_output = xmod.model(input_ids, lang_id=[SAMPLE_LANGUAGE])[0]
print(our_output.shape, their_output.shape)
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
success = torch.allclose(our_output, their_output, atol=1e-3)
print("Do both models output the same tensors?", "🔥" if success else "💩")
if not success:
raise Exception("Something went wRoNg")
Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
print(f"Saving model to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--xmod_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
)
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
parser.add_argument(
"--classification_head", action="store_true", help="Whether to convert a final classification head."
)
args = parser.parse_args()
convert_xmod_checkpoint_to_pytorch(
args.xmod_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
)
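A sketch of invoking the conversion entry point from Python; both paths are placeholders for a locally downloaded fairseq checkpoint and an output directory:

```python
# Equivalent to running the script with --xmod_checkpoint_path and
# --pytorch_dump_folder_path; both paths below are placeholders.
convert_xmod_checkpoint_to_pytorch(
    xmod_checkpoint_path="/path/to/fairseq-xmod-base/model.pt",
    pytorch_dump_folder_path="/path/to/converted-xmod-base",
    classification_head=False,
)
```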
This diff is collapsed.
......@@ -6937,6 +6937,65 @@ def load_tf_weights_in_xlnet(*args, **kwargs):
requires_backends(load_tf_weights_in_xlnet, ["torch"])
XMOD_PRETRAINED_MODEL_ARCHIVE_LIST = None
class XmodForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodForMultipleChoice(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodForQuestionAnswering(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodForSequenceClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodForTokenClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class XmodPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
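These dummy classes keep `from transformers import XmodModel` importable in a PyTorch-free environment while failing loudly on instantiation. A sketch of that failure mode, assuming torch is not installed:

```python
from transformers import XmodModel  # resolves to the dummy class when torch is absent

try:
    XmodModel()
except ImportError as err:
    # requires_backends reports which backend ("torch") is missing
    print(err)
```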
This diff is collapsed.
......@@ -217,6 +217,8 @@ src/transformers/models/xlm/configuration_xlm.py
src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py
src/transformers/models/xlnet/configuration_xlnet.py
src/transformers/models/xmod/configuration_xmod.py
src/transformers/models/xmod/modeling_xmod.py
src/transformers/models/yolos/configuration_yolos.py
src/transformers/models/yolos/modeling_yolos.py
src/transformers/models/x_clip/modeling_x_clip.py
......