"tests/vscode:/vscode.git/clone" did not exist on "9c9b8e707b9803a1425ed8dd2f51069b22d9230f"
Unverified commit 86cff21c authored by Yih-Dar, committed by GitHub

Fix some TF GPT-J CI tests (#16454)



* Fix for test_mixed_precision

* Fix test_saved_model_creation by using shape_list instead of shape (see the sketch below the commit metadata)

* skip test_model_from_pretrained on GPU for now to avoid GPU OOM

* skip test_gptj_sample_max_time for now
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent aebca696
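Note on the shape_list change (a sketch for context, not part of the diff): SavedModel creation traces the model with dynamic (None) sequence dimensions, so the static x.shape[i] returns None and arithmetic such as x.shape[1] + offset breaks. The library's shape_list helper falls back to tf.shape for those axes. A minimal stand-alone equivalent, with the hypothetical name shape_list_sketch:

import tensorflow as tf


def shape_list_sketch(t: tf.Tensor):
    # Static dims become Python ints; None dims fall back to the dynamic tf.shape lookup.
    static = t.shape.as_list()
    dynamic = tf.shape(t)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]


@tf.function(input_signature=[tf.TensorSpec(shape=[None, None, 64], dtype=tf.float32)])
def positions(x):
    # Inside this trace x.shape[1] is None, so `x.shape[1] + offset` would raise a TypeError;
    # the dynamic lookup keeps working and the export succeeds.
    seq_len = shape_list_sketch(x)[1]
    return tf.range(seq_len)

Calling positions(tf.zeros([2, 5, 64])) returns [0, 1, 2, 3, 4]; the same body written with x.shape[1] fails at trace time because None cannot be added to an int.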
@@ -59,13 +59,13 @@ GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST = [
 def fixed_pos_embedding(x: tf.Tensor, seq_dim: int = 1, seq_len: Optional[int] = None) -> Tuple[tf.Tensor, tf.Tensor]:
-    dim = x.shape[-1]
+    dim = shape_list(x)[-1]
     if seq_len is None:
-        seq_len = x.shape[seq_dim]
+        seq_len = shape_list(x)[seq_dim]
     inv_freq = tf.cast(1.0 / (10000 ** (tf.range(0, dim, 2) / dim)), tf.float32)
     seq_len_range = tf.cast(tf.range(seq_len), tf.float32)
     sinusoid_inp = tf.cast(tf.einsum("i , j -> i j", seq_len_range, inv_freq), tf.float32)
-    return tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)
+    return tf.cast(tf.sin(sinusoid_inp), dtype=x.dtype), tf.cast(tf.cos(sinusoid_inp), dtype=x.dtype)


 def rotate_every_two(x: tf.Tensor) -> tf.Tensor:
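Note on the return-cast change above (illustration only; the variable names below are made up): the sinusoid table is built in float32, so in a mixed-precision run the hidden states are float16 while tf.sin/tf.cos stay float32, and combining the two raises a dtype mismatch. Casting back to x.dtype is what makes test_mixed_precision pass.

import tensorflow as tf

x = tf.zeros([1, 8, 16], dtype=tf.float16)  # activations as produced under a mixed_float16 policy

inv_freq = 1.0 / (10000 ** (tf.range(0, 16, 2, dtype=tf.float32) / 16))
sinusoid_inp = tf.einsum("i , j -> i j", tf.range(8, dtype=tf.float32), inv_freq)

sin_fp32 = tf.sin(sinusoid_inp)                          # float32: mixing with x raises a dtype error
sin_cast = tf.cast(tf.sin(sinusoid_inp), dtype=x.dtype)  # float16: matches x, as in the fixed return
print(sin_fp32.dtype, sin_cast.dtype)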
@@ -77,8 +77,8 @@ def rotate_every_two(x: tf.Tensor) -> tf.Tensor:
 def apply_rotary_pos_emb(x: tf.Tensor, sincos: tf.Tensor, offset: int = 0) -> tf.Tensor:
     sin_pos, cos_pos = sincos
-    sin_pos = tf.repeat(sin_pos[None, offset : x.shape[1] + offset, None, :], 2, 3)
-    cos_pos = tf.repeat(cos_pos[None, offset : x.shape[1] + offset, None, :], 2, 3)
+    sin_pos = tf.repeat(sin_pos[None, offset : shape_list(x)[1] + offset, None, :], 2, 3)
+    cos_pos = tf.repeat(cos_pos[None, offset : shape_list(x)[1] + offset, None, :], 2, 3)
     return (x * cos_pos) + (rotate_every_two(x) * sin_pos)
@@ -173,7 +173,7 @@ class TFGPTJAttention(tf.keras.layers.Layer):
         head_mask: Optional[tf.Tensor] = None,
     ) -> Tuple[tf.Tensor, tf.Tensor]:
         # compute causal mask from causal mask buffer
-        query_length, key_length = query.shape[-2], key.shape[-2]
+        query_length, key_length = shape_list(query)[-2], shape_list(key)[-2]
         causal_mask = self.get_causal_mask(key_length, query_length)

         # Keep the attention weights computation in fp32 to avoid overflow issues
@@ -218,11 +218,11 @@ class TFGPTJAttention(tf.keras.layers.Layer):
         key = self._split_heads(key, True)
         value = self._split_heads(value, False)

-        seq_len = key.shape[1]
+        seq_len = shape_list(key)[1]
         offset = 0

         if layer_past is not None:
-            offset = layer_past[0].shape[-2]
+            offset = shape_list(layer_past[0])[-2]
             seq_len += offset

         if self.rotary_dim is not None:
@@ -345,6 +345,10 @@ class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase):
                 assert name is None

     @slow
+    @unittest.skipIf(
+        not is_tf_available() or len(tf.config.list_physical_devices("GPU")) > 0,
+        "skip testing on GPU for now to avoid GPU OOM.",
+    )
     def test_model_from_pretrained(self):
         model = TFGPTJModel.from_pretrained("EleutherAI/gpt-j-6B", from_pt=True)
         self.assertIsNotNone(model)
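The skipIf guard above only lets the 6B checkpoint load on CPU-only hosts: the test is skipped when no TF install is found or when any GPU is visible. The same check in isolation (hypothetical function name):

import tensorflow as tf


def skip_on_gpu() -> bool:
    # True when at least one GPU is visible; loading EleutherAI/gpt-j-6B with
    # from_pt=True does not fit in the CI GPU memory, hence the temporary skip.
    return len(tf.config.list_physical_devices("GPU")) > 0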
@@ -395,6 +399,7 @@ class TFGPTJModelLanguageGenerationTest(unittest.TestCase):
         )  # token_type_ids should change output

     @slow
+    @unittest.skip(reason="TF generate currently has no time-based stopping criteria")
     def test_gptj_sample_max_time(self):
         tokenizer = AutoTokenizer.from_pretrained("anton-l/gpt-j-tiny-random")
         model = TFGPTJForCausalLM.from_pretrained("anton-l/gpt-j-tiny-random", from_pt=True)
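Background on this last skip (a hedged sketch, not part of the diff): the test exercises wall-clock-bounded sampling, which at this point only the PyTorch generate() supports via its max_time argument; the TF generate() has no such stopping criterion yet, so the test is skipped rather than fixed. Roughly what the PyTorch side of the same check looks like, assuming the max_time argument of that era:

from transformers import AutoTokenizer, GPTJForCausalLM  # PyTorch classes

tokenizer = AutoTokenizer.from_pretrained("anton-l/gpt-j-tiny-random")
model = GPTJForCausalLM.from_pretrained("anton-l/gpt-j-tiny-random")

input_ids = tokenizer("Today is a nice day and", return_tensors="pt").input_ids
# max_time caps generation wall-clock time; there is no TF equivalent at this commit.
output = model.generate(input_ids, do_sample=True, max_time=0.5, max_length=256)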