[ProphetNet] Bart-like Refactor (#10501)

* first step to refactor
* make all fast tests pass
* make all slow tests pass
* save intermediate
* correct cache
* finish PR
* make fp16 work

[ProphetNet] Bart-like Refactor (#10501)
* first step to refactor
* make all fast tests pass
* make all slow tests pass
* save intermediate
* correct cache
* finish PR
* make fp16 work
c503a1c1 · Patrick von Platen · GitHub · 6290169e · c503a1c1 · c503a1c1
Unverified commit c503a1c1, authored Mar 04, 2021 by Patrick von Platen; committed by GitHub on Mar 04, 2021.
3 changed files
--- a/src/transformers/models/prophetnet/configuration_prophetnet.py
+++ b/src/transformers/models/prophetnet/configuration_prophetnet.py
@@ -92,6 +92,8 @@ class ProphetNetConfig(PretrainedConfig):
            smoothing is performed.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
    """
    model_type = "prophetnet"
    keys_to_ignore_at_inference = ["past_key_values"]
@@ -119,6 +121,7 @@ class ProphetNetConfig(PretrainedConfig):
        num_buckets=32,
        relative_max_distance=128,
        disable_ngram_loss=False,
+        gradient_checkpointing=False,
        eps=0.0,
        use_cache=True,
        pad_token_id=0,
@@ -161,6 +164,9 @@ class ProphetNetConfig(PretrainedConfig):
        self.use_cache = use_cache
+        # 4 Training Args (should be removed soon)
+        self.gradient_checkpointing = gradient_checkpointing
    @property
    def num_attention_heads(self) -> int:
        return self.num_encoder_attention_heads

--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
--- a/tests/test_modeling_prophetnet.py
+++ b/tests/test_modeling_prophetnet.py
@@ -243,7 +243,7 @@ class ProphetNetModelTester:
        # There should be `num_layers` key value embeddings stored in decoder_past
        self.parent.assertEqual(len(decoder_past), config.num_decoder_layers)
        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
-        self.parent.assertEqual(len(decoder_past[0]), 2)  # cross-attention + uni-directional self-attention
+        self.parent.assertEqual(len(decoder_past[0]), 4)  # cross-attention + uni-directional self-attention
    def create_and_check_with_lm_head(
        self,