Unverified Commit 0afe4a90 authored by Patrick Fernandes, committed by GitHub

[Flax] Add Electra models (#11426)



* add electra model to flax

* Remove Electra Next Sentence Prediction model added by mistake

* fix parameter sharing and loosen equality threshold

* fix styling issues

* add mistakenly removed imports

* fix electra table

* Add FlaxElectra to automodels and fix docs

* fix issues pointed out in the PR

* fix flax electra to comply with latest changes

* remove stale class

* add copied from
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent 226e74b6
@@ -295,7 +295,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
......
@@ -185,3 +185,52 @@ TFElectraForQuestionAnswering
.. autoclass:: transformers.TFElectraForQuestionAnswering
:members: call
FlaxElectraModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraModel
:members: __call__
FlaxElectraForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraForPreTraining
:members: __call__
FlaxElectraForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraForMaskedLM
:members: __call__
FlaxElectraForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraForSequenceClassification
:members: __call__
FlaxElectraForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraForMultipleChoice
:members: __call__
FlaxElectraForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraForTokenClassification
:members: __call__
FlaxElectraForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxElectraForQuestionAnswering
:members: __call__
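
For readers landing on the new doc entries above, a minimal usage sketch follows. It is an illustration, not part of the committed docs: the checkpoint name is the one exercised by the slow test at the bottom of this PR, and ElectraTokenizerFast is the standard ELECTRA tokenizer assumed to pair with it.

from transformers import ElectraTokenizerFast, FlaxElectraModel

tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
model = FlaxElectraModel.from_pretrained("google/electra-small-discriminator")

# Fast tokenizers can return NumPy arrays, which Flax models consume directly.
inputs = tokenizer("ELECTRA detects replaced tokens.", return_tensors="np")
outputs = model(**inputs)

# First element is the sequence of hidden states: (batch_size, seq_len, hidden_size).
print(outputs[0].shape)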
@@ -1403,6 +1403,18 @@ if is_flax_available():
"FlaxBertPreTrainedModel",
]
)
_import_structure["models.electra"].extend(
[
"FlaxElectraForMaskedLM",
"FlaxElectraForMultipleChoice",
"FlaxElectraForPreTraining",
"FlaxElectraForQuestionAnswering",
"FlaxElectraForSequenceClassification",
"FlaxElectraForTokenClassification",
"FlaxElectraModel",
"FlaxElectraPreTrainedModel",
]
)
_import_structure["models.roberta"].extend( _import_structure["models.roberta"].extend(
[ [
"FlaxRobertaForMaskedLM", "FlaxRobertaForMaskedLM",
...@@ -2585,6 +2597,16 @@ if TYPE_CHECKING: ...@@ -2585,6 +2597,16 @@ if TYPE_CHECKING:
FlaxBertModel, FlaxBertModel,
FlaxBertPreTrainedModel, FlaxBertPreTrainedModel,
) )
from .models.electra import (
FlaxElectraForMaskedLM,
FlaxElectraForMultipleChoice,
FlaxElectraForPreTraining,
FlaxElectraForQuestionAnswering,
FlaxElectraForSequenceClassification,
FlaxElectraForTokenClassification,
FlaxElectraModel,
FlaxElectraPreTrainedModel,
)
from .models.roberta import (
FlaxRobertaForMaskedLM,
FlaxRobertaForMultipleChoice,
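
The _import_structure entries and the matching TYPE_CHECKING imports above make the new classes importable from the top-level package when the flax backend is installed; a quick sanity check, using only names that appear in this diff:

from transformers import is_flax_available

if is_flax_available():
    # Resolved lazily via _import_structure only when the attribute is first accessed.
    from transformers import FlaxElectraForMaskedLM, FlaxElectraModel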
......
@@ -28,6 +28,15 @@ from ..bert.modeling_flax_bert import (
FlaxBertForTokenClassification,
FlaxBertModel,
)
from ..electra.modeling_flax_electra import (
FlaxElectraForMaskedLM,
FlaxElectraForMultipleChoice,
FlaxElectraForPreTraining,
FlaxElectraForQuestionAnswering,
FlaxElectraForSequenceClassification,
FlaxElectraForTokenClassification,
FlaxElectraModel,
)
from ..roberta.modeling_flax_roberta import (
FlaxRobertaForMaskedLM,
FlaxRobertaForMultipleChoice,
@@ -37,7 +46,7 @@ from ..roberta.modeling_flax_roberta import (
FlaxRobertaModel,
)
from .auto_factory import auto_class_factory
from .configuration_auto import BertConfig, ElectraConfig, RobertaConfig
logger = logging.get_logger(__name__)
@@ -48,6 +57,7 @@ FLAX_MODEL_MAPPING = OrderedDict(
# Base model mapping
(RobertaConfig, FlaxRobertaModel),
(BertConfig, FlaxBertModel),
(ElectraConfig, FlaxElectraModel),
]
)
@@ -56,6 +66,7 @@ FLAX_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
# Model for pre-training mapping
(RobertaConfig, FlaxRobertaForMaskedLM),
(BertConfig, FlaxBertForPreTraining),
(ElectraConfig, FlaxElectraForPreTraining),
]
)
@@ -64,6 +75,7 @@ FLAX_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
# Model for Masked LM mapping
(RobertaConfig, FlaxRobertaForMaskedLM),
(BertConfig, FlaxBertForMaskedLM),
(ElectraConfig, FlaxElectraForMaskedLM),
]
)
@@ -72,6 +84,7 @@ FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
# Model for Sequence Classification mapping
(RobertaConfig, FlaxRobertaForSequenceClassification),
(BertConfig, FlaxBertForSequenceClassification),
(ElectraConfig, FlaxElectraForSequenceClassification),
]
)
@@ -80,6 +93,7 @@ FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
# Model for Question Answering mapping
(RobertaConfig, FlaxRobertaForQuestionAnswering),
(BertConfig, FlaxBertForQuestionAnswering),
(ElectraConfig, FlaxElectraForQuestionAnswering),
]
)
@@ -88,6 +102,7 @@ FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
# Model for Token Classification mapping
(RobertaConfig, FlaxRobertaForTokenClassification),
(BertConfig, FlaxBertForTokenClassification),
(ElectraConfig, FlaxElectraForTokenClassification),
]
)
@@ -96,6 +111,7 @@ FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
# Model for Multiple Choice mapping
(RobertaConfig, FlaxRobertaForMultipleChoice),
(BertConfig, FlaxBertForMultipleChoice),
(ElectraConfig, FlaxElectraForMultipleChoice),
]
)
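
The practical effect of the new mapping entries is that the Flax auto classes built from these OrderedDicts via auto_class_factory now dispatch ELECTRA checkpoints to the classes added in this PR. A hedged sketch, assuming FlaxAutoModel is the auto class generated from FLAX_MODEL_MAPPING and reusing the checkpoint from the tests:

from transformers import FlaxAutoModel

# ElectraConfig now maps to FlaxElectraModel in FLAX_MODEL_MAPPING.
model = FlaxAutoModel.from_pretrained("google/electra-small-discriminator")
print(type(model).__name__)  # expected: FlaxElectraModel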
......
@@ -18,7 +18,13 @@
from typing import TYPE_CHECKING
from ...file_utils import (
_BaseLazyModule,
is_flax_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
_import_structure = {
@@ -56,6 +62,18 @@ if is_tf_available():
"TFElectraPreTrainedModel",
]
if is_flax_available():
_import_structure["modeling_flax_electra"] = [
"FlaxElectraForMaskedLM",
"FlaxElectraForMultipleChoice",
"FlaxElectraForPreTraining",
"FlaxElectraForQuestionAnswering",
"FlaxElectraForSequenceClassification",
"FlaxElectraForTokenClassification",
"FlaxElectraModel",
"FlaxElectraPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
@@ -91,6 +109,18 @@ if TYPE_CHECKING:
TFElectraPreTrainedModel,
)
if is_flax_available():
from .modeling_flax_electra import (
FlaxElectraForMaskedLM,
FlaxElectraForMultipleChoice,
FlaxElectraForPreTraining,
FlaxElectraForQuestionAnswering,
FlaxElectraForSequenceClassification,
FlaxElectraForTokenClassification,
FlaxElectraModel,
FlaxElectraPreTrainedModel,
)
else:
import importlib
import os
......
This diff is collapsed.
@@ -180,6 +180,74 @@ class FlaxBertPreTrainedModel:
requires_backends(self, ["flax"])
class FlaxElectraForMaskedLM:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraForMultipleChoice:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraForPreTraining:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraForQuestionAnswering:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraForSequenceClassification:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraForTokenClassification:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxElectraPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_backends(self, ["flax"])
class FlaxRobertaForMaskedLM:
def __init__(self, *args, **kwargs):
requires_backends(self, ["flax"])
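
These dummy classes are what the package falls back to when the flax backend is missing, so a broken environment fails with an actionable message instead of an AttributeError. A hedged sketch of that failure mode, assuming requires_backends raises an ImportError-style error as the other dummy modules do:

from transformers import FlaxElectraModel, is_flax_available

# Only meaningful in an environment *without* flax/jax installed.
if not is_flax_available():
    try:
        FlaxElectraModel()  # resolves to the dummy class defined above
    except ImportError as err:
        print(err)  # tells the user to install the flax backend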
......
import unittest
import numpy as np
from transformers import ElectraConfig, is_flax_available
from transformers.testing_utils import require_flax, slow
from .test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
if is_flax_available():
from transformers.models.electra.modeling_flax_electra import (
FlaxElectraForMaskedLM,
FlaxElectraForMultipleChoice,
FlaxElectraForPreTraining,
FlaxElectraForQuestionAnswering,
FlaxElectraForSequenceClassification,
FlaxElectraForTokenClassification,
FlaxElectraModel,
)
class FlaxElectraModelTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_attention_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=24,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_choices=4,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.embedding_size = embedding_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_choices = num_choices
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = random_attention_mask([self.batch_size, self.seq_length])
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
config = ElectraConfig(
vocab_size=self.vocab_size,
hidden_size=self.hidden_size,
embedding_size=self.embedding_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
)
return config, input_ids, token_type_ids, attention_mask
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, token_type_ids, attention_mask = config_and_inputs
inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
return config, inputs_dict
@require_flax
class FlaxElectraModelTest(FlaxModelTesterMixin, unittest.TestCase):
all_model_classes = (
(
FlaxElectraModel,
FlaxElectraForMaskedLM,
FlaxElectraForPreTraining,
FlaxElectraForTokenClassification,
FlaxElectraForQuestionAnswering,
FlaxElectraForMultipleChoice,
FlaxElectraForSequenceClassification,
)
if is_flax_available()
else ()
)
def setUp(self):
self.model_tester = FlaxElectraModelTester(self)
@slow
def test_model_from_pretrained(self):
for model_class_name in self.all_model_classes:
if model_class_name == FlaxElectraForMaskedLM:
model = model_class_name.from_pretrained("google/electra-small-generator")
else:
model = model_class_name.from_pretrained("google/electra-small-discriminator")
outputs = model(np.ones((1, 1)))
self.assertIsNotNone(outputs)
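
The slow test above only asserts that a forward pass returns something. A hedged end-to-end sketch of the same replaced-token-detection head on real text follows; the checkpoint name comes from the test, while the tokenizer class and the exact logits shape are assumptions based on the PyTorch ELECTRA equivalents.

from transformers import ElectraTokenizerFast, FlaxElectraForPreTraining

tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

inputs = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors="np")
outputs = model(**inputs)

# One score per token: higher means the discriminator thinks the token was replaced.
logits = outputs[0]
print(logits.shape)  # expected: (1, sequence_length)

As with the other @slow-decorated tests in the repository, the from_pretrained test only runs when the RUN_SLOW=1 environment variable is set.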