"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "b4eef63a1de97b9bbd8d54b83ede16e34afe3529"
Unverified commit a317e6c3, authored by Patrick von Platen, committed by GitHub

[Flax] Correctly Add MT5 (#12988)



* finish PR

* finish mt5

* push

* up

* Update tests/test_modeling_flax_mt5.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>
parent da9754a3
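
In short, this commit adds dedicated Flax MT5 classes (FlaxMT5Model and FlaxMT5ForConditionalGeneration) and wires them into the package imports, the Flax auto-model mappings, the docs, and a slow integration test. A minimal usage sketch, adapted from the docstring examples in the new modeling_flax_mt5.py further down (same google/mt5-small checkpoint and example inputs; illustrative, not copied verbatim from the diff):

# Minimal usage of the new Flax MT5 classes, mirroring the docstring examples below.
from transformers import FlaxMT5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
model = FlaxMT5ForConditionalGeneration.from_pretrained("google/mt5-small")

article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
summary = "Weiter Verhandlung in Syrien."

inputs = tokenizer(article, return_tensors="np")
with tokenizer.as_target_tokenizer():
    decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids

outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
print(outputs.logits.shape)  # (1, target_length, vocab_size)
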
@@ -428,7 +428,7 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 | mBART                       | ✅             | ✅             | ✅              | ✅                 | ✅           |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-| mT5                         | ✅             | ✅             | ✅              | ✅                 |              |
+| mT5                         | ✅             | ✅             | ✅              | ✅                 | ✅           |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
......
@@ -94,3 +94,17 @@ TFMT5EncoderModel
 .. autoclass:: transformers.TFMT5EncoderModel
     :members:
+
+
+FlaxMT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5Model
+    :members:
+
+
+FlaxMT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5ForConditionalGeneration
+    :members:
@@ -1691,6 +1691,7 @@ if is_flax_available():
             "FlaxMBartPreTrainedModel",
         ]
     )
+    _import_structure["models.mt5"].extend(["FlaxMT5ForConditionalGeneration", "FlaxMT5Model"])
     _import_structure["models.roberta"].extend(
         [
             "FlaxRobertaForMaskedLM",
@@ -3120,6 +3121,7 @@ if TYPE_CHECKING:
             FlaxMBartModel,
             FlaxMBartPreTrainedModel,
         )
+        from .models.mt5 import FlaxMT5ForConditionalGeneration, FlaxMT5Model
         from .models.roberta import (
             FlaxRobertaForMaskedLM,
             FlaxRobertaForMultipleChoice,
......
@@ -62,6 +62,7 @@ from ..mbart.modeling_flax_mbart import (
     FlaxMBartForSequenceClassification,
     FlaxMBartModel,
 )
+from ..mt5.modeling_flax_mt5 import FlaxMT5ForConditionalGeneration, FlaxMT5Model
 from ..roberta.modeling_flax_roberta import (
     FlaxRobertaForMaskedLM,
     FlaxRobertaForMultipleChoice,
@@ -109,7 +110,7 @@ FLAX_MODEL_MAPPING = OrderedDict(
         (ViTConfig, FlaxViTModel),
         (MBartConfig, FlaxMBartModel),
         (T5Config, FlaxT5Model),
-        (MT5Config, FlaxT5Model),
+        (MT5Config, FlaxMT5Model),
         (Wav2Vec2Config, FlaxWav2Vec2Model),
         (MarianConfig, FlaxMarianModel),
     ]
@@ -125,7 +126,7 @@ FLAX_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
         (ElectraConfig, FlaxElectraForPreTraining),
         (MBartConfig, FlaxMBartForConditionalGeneration),
         (T5Config, FlaxT5ForConditionalGeneration),
-        (MT5Config, FlaxT5ForConditionalGeneration),
+        (MT5Config, FlaxMT5ForConditionalGeneration),
         (Wav2Vec2Config, FlaxWav2Vec2ForPreTraining),
     ]
 )
@@ -147,7 +148,7 @@ FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
         # Model for Seq2Seq Causal LM mapping
         (BartConfig, FlaxBartForConditionalGeneration),
         (T5Config, FlaxT5ForConditionalGeneration),
-        (MT5Config, FlaxT5ForConditionalGeneration),
+        (MT5Config, FlaxMT5ForConditionalGeneration),
         (MarianConfig, FlaxMarianMTModel),
     ]
 )
......
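
The three mapping fixes above mean the Flax auto classes now resolve an MT5Config to the MT5-specific classes rather than the plain T5 ones. A small sketch of the expected behaviour, assuming the existing FlaxAutoModelForSeq2SeqLM auto class backed by FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING:

# Expected class resolution after the mapping fix; google/mt5-small ships an MT5Config.
from transformers import FlaxAutoModelForSeq2SeqLM

model = FlaxAutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
print(type(model).__name__)  # FlaxMT5ForConditionalGeneration (the mapping previously pointed at the plain FlaxT5 class)
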
@@ -20,6 +20,7 @@ from typing import TYPE_CHECKING
 from ...file_utils import (
     _LazyModule,
+    is_flax_available,
     is_sentencepiece_available,
     is_tf_available,
     is_tokenizers_available,
@@ -51,6 +52,9 @@ if is_torch_available():
 if is_tf_available():
     _import_structure["modeling_tf_mt5"] = ["TFMT5EncoderModel", "TFMT5ForConditionalGeneration", "TFMT5Model"]

+if is_flax_available():
+    _import_structure["modeling_flax_mt5"] = ["FlaxMT5ForConditionalGeneration", "FlaxMT5Model"]
+
 if TYPE_CHECKING:
     from .configuration_mt5 import MT5Config
@@ -61,6 +65,9 @@ if TYPE_CHECKING:
     if is_tf_available():
         from .modeling_tf_mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model

+    if is_flax_available():
+        from .modeling_flax_mt5 import FlaxMT5ForConditionalGeneration, FlaxMT5Model
+
 else:
     import sys
......
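
Both this sub-package __init__ and the top-level __init__ edited earlier follow the library's lazy-import pattern: names are registered in _import_structure and only imported for real under TYPE_CHECKING, so importing transformers stays cheap and does not require jax/flax. A simplified, hypothetical sketch of that pattern (not the actual _LazyModule implementation):

# Hypothetical, stripped-down version of the lazy-import idea: the submodule is
# only imported when one of its symbols is first accessed (PEP 562 __getattr__).
import importlib

_import_structure = {"modeling_flax_mt5": ["FlaxMT5ForConditionalGeneration", "FlaxMT5Model"]}
_symbol_to_module = {sym: mod for mod, syms in _import_structure.items() for sym in syms}


def __getattr__(name):
    if name in _symbol_to_module:
        module = importlib.import_module("." + _symbol_to_module[name], __name__)
        return getattr(module, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
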
# coding=utf-8
# Copyright 2021 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Flax mT5 model. """

from ...utils import logging
from ..t5.modeling_flax_t5 import FlaxT5ForConditionalGeneration, FlaxT5Model
from .configuration_mt5 import MT5Config


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "T5Config"
_TOKENIZER_FOR_DOC = "T5Tokenizer"


class FlaxMT5Model(FlaxT5Model):
    r"""
    This class overrides :class:`~transformers.FlaxT5Model`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Examples::

        >>> from transformers import FlaxMT5Model, T5Tokenizer

        >>> model = FlaxMT5Model.from_pretrained("google/mt5-small")
        >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")

        >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
        >>> summary = "Weiter Verhandlung in Syrien."
        >>> inputs = tokenizer(article, return_tensors="np")

        >>> with tokenizer.as_target_tokenizer():
        ...     decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids

        >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
        >>> hidden_states = outputs.last_hidden_state
    """

    model_type = "mt5"
    config_class = MT5Config


class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration):
    r"""
    This class overrides :class:`~transformers.FlaxT5ForConditionalGeneration`. Please check the superclass for the
    appropriate documentation alongside usage examples.

    Examples::

        >>> from transformers import FlaxMT5ForConditionalGeneration, T5Tokenizer

        >>> model = FlaxMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
        >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")

        >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
        >>> summary = "Weiter Verhandlung in Syrien."
        >>> inputs = tokenizer(article, return_tensors="np")

        >>> with tokenizer.as_target_tokenizer():
        ...     decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids

        >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
        >>> logits = outputs.logits
    """

    model_type = "mt5"
    config_class = MT5Config
@@ -642,6 +642,24 @@ class FlaxMBartPreTrainedModel:
         requires_backends(cls, ["flax"])


+class FlaxMT5ForConditionalGeneration:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
+class FlaxMT5Model:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
 class FlaxRobertaForMaskedLM:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["flax"])
......
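
The dummy objects above exist so the new names can always be imported from transformers even when jax/flax is not installed; actually using them then fails with an informative error from requires_backends. A sketch of the expected behaviour, assuming an environment without the flax backend:

# Assumes jax/flax are NOT installed in this environment.
from transformers import FlaxMT5Model  # the import itself still succeeds (dummy object)

try:
    FlaxMT5Model.from_pretrained("google/mt5-small")
except ImportError as err:
    # requires_backends raises an ImportError explaining that the flax backend
    # (jax, jaxlib, flax) must be installed to use this class
    print(err)
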
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import is_flax_available
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow


if is_flax_available():
    import optax
    from flax.training.common_utils import onehot

    from transformers import AutoTokenizer, FlaxMT5ForConditionalGeneration
    from transformers.models.t5.modeling_flax_t5 import shift_tokens_right


@require_torch
@require_sentencepiece
@require_tokenizers
class MT5IntegrationTest(unittest.TestCase):
    @slow
    def test_small_integration_test(self):
        """
        For comparison run:
        >>> import t5  # pip install t5==0.7.1
        >>> from t5.data.sentencepiece_vocabulary import SentencePieceVocabulary

        >>> path_to_mtf_small_mt5_checkpoint = '<fill_in>'
        >>> path_to_mtf_small_mt5_spm_model_path = '<fill_in>'
        >>> t5_model = t5.models.MtfModel(model_dir=path_to_mtf_small_mt5_checkpoint, batch_size=1, tpu=None)
        >>> vocab = SentencePieceVocabulary(path_to_mtf_small_mt5_spm_model_path)
        >>> score = t5_model.score(inputs=["Hello there"], targets=["Hi I am"], vocabulary=vocab)
        """

        model = FlaxMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
        tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

        input_ids = tokenizer("Hello there", return_tensors="np").input_ids
        labels = tokenizer("Hi I am", return_tensors="np").input_ids

        decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id, model.config.decoder_start_token_id)

        logits = model(input_ids, decoder_input_ids=decoder_input_ids).logits
        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()

        mtf_score = -(labels.shape[-1] * loss.item())

        EXPECTED_SCORE = -84.9127
        self.assertTrue(abs(mtf_score - EXPECTED_SCORE) < 1e-4)
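
The test is gated behind @slow, so it only runs when RUN_SLOW=1 is set in the environment. The mtf_score arithmetic follows from the loss definition: optax.softmax_cross_entropy with one-hot labels gives the per-position negative log-probability, so labels.shape[-1] * loss is the summed negative log-likelihood and its negation is log p(target | input), which is what the Mesh-TensorFlow score in the docstring reports. A small illustrative sketch of that equivalence (helper name is hypothetical, batch size 1 assumed):

import jax
import jax.numpy as jnp


def sequence_log_likelihood(logits, labels):
    # Sum of log p(label_t | ...) over target positions; for batch size 1 this equals
    # -(labels.shape[-1] * optax.softmax_cross_entropy(logits, onehot(labels, V)).mean()).
    log_probs = jax.nn.log_softmax(logits, axis=-1)
    token_log_probs = jnp.take_along_axis(log_probs, labels[..., None], axis=-1).squeeze(-1)
    return token_log_probs.sum()
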
@@ -82,8 +82,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
 # trigger the common tests.
 TEST_FILES_WITH_NO_COMMON_TESTS = [
     "test_modeling_camembert.py",
-    "test_modeling_flax_bert.py",
-    "test_modeling_flax_roberta.py",
+    "test_modeling_flax_mt5.py",
     "test_modeling_mbart.py",
     "test_modeling_mt5.py",
     "test_modeling_pegasus.py",
......