[JAX] Scale sequence length in CP tests to avoid tiny sizes. (#1347)

Scale sequence length in CP tests to avoid tiny sizes.

Signed-off-by: Michael Goldfarb <mgoldfarb@nvidia.com>

[JAX] Scale sequence length in CP tests to avoid tiny sizes. (#1347)
Scale sequence length in CP tests to avoid tiny sizes.

Signed-off-by: Michael Goldfarb <mgoldfarb@nvidia.com>
d3cbccdf · Michael Goldfarb · GitHub · 64126aa8 · d3cbccdf
Unverified Commit d3cbccdf authored Dec 04, 2024 by Michael Goldfarb Committed by GitHub Dec 04, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 2 deletions

tests/jax/test_distributed_fused_attn.py tests/jax/test_distributed_fused_attn.py +9 -2

No files found.
--- a/tests/jax/test_distributed_fused_attn.py
+++ b/tests/jax/test_distributed_fused_attn.py
@@ -341,8 +341,9 @@ class TestDistributedCrossAttn:
 @pytest.mark.parametrize(
    "data_shape",
    [
-        pytest.param([2, 512, 12, 128], id="2-512-12-128"),
+        # Sequence lengths will be scaled by CP so that we don't run with tiny sizes.
-        pytest.param([4, 1024, 16, 64], id="4-1024-16-64"),
+        pytest.param([2, 128, 12, 128], id="2-128xCP-12-128"),
+        pytest.param([4, 256, 16, 64], id="4-256xCP-16-64"),
    ],
 )
 @pytest.mark.parametrize("kv_groups", [1, 4, 8, 12, 16])
@@ -423,6 +424,12 @@ class TestDistributedContextParallelSelfAttn:
        qkv_format = get_qkv_format(qkv_layout)
        batch, seqlen, num_head, hidden = data_shape
+        # Scale the sequence length by 2*CP so it's never too small as we scale up the test.
+        # 2*CP is used since we split into two CP groups for load balancing.
+        seqlen = seqlen * cp_size * 2
+        data_shape = batch, seqlen, num_head, hidden
        num_kv_heads = num_head // kv_groups
        scaling_factor = 1.0 / np.sqrt(num_head)