[PyTorch] fix `test_current_device` test (#2398)

* fix test_current_device Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

[PyTorch] fix `test_current_device` test (#2398)
* fix test_current_device Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
41fb9bcf · Charlene Yang · GitHub · 05bfa3f8 · 41fb9bcf
Unverified Commit 41fb9bcf authored Nov 18, 2025 by Charlene Yang Committed by GitHub Nov 18, 2025
Show whitespace changes
Inline Side-by-side

Showing with 50 additions and 10 deletions

tests/pytorch/distributed/test_sanity.py tests/pytorch/distributed/test_sanity.py +50 -10

No files found.
--- a/tests/pytorch/distributed/test_sanity.py
+++ b/tests/pytorch/distributed/test_sanity.py
@@ -55,7 +55,29 @@ def test_current_device(model, module):
            self_attn_mask_type="padding",
            device=f"cuda:{tensor_device}",
        )
-        num_tokens = torch.randint(0, config.max_seqlen_q, (1,)).item()
+        seqlens_q = torch.randint(
+            1,
+            config.max_seqlen_q,
+            [config.batch_size],
+            dtype=torch.int32,
+            device=f"cuda:{tensor_device}",
+        )
+        cu_seqlens_q = torch.zeros(
+            config.batch_size + 1, dtype=torch.int32, device=f"cuda:{tensor_device}"
+        )
+        cu_seqlens_q[1:] = torch.cumsum(seqlens_q, dim=0)
+        seqlens_kv = torch.randint(
+            1,
+            config.max_seqlen_kv,
+            [config.batch_size],
+            dtype=torch.int32,
+            device=f"cuda:{tensor_device}",
+        )
+        cu_seqlens_kv = torch.zeros(
+            config.batch_size + 1, dtype=torch.int32, device=f"cuda:{tensor_device}"
+        )
+        cu_seqlens_kv[1:] = torch.cumsum(seqlens_kv, dim=0)
+        num_tokens = cu_seqlens_q[-1]
        args = [
            torch.randn(
                (num_tokens, config.hidden_size),
@@ -64,9 +86,6 @@ def test_current_device(model, module):
                requires_grad=True,
            )
        ]
-        cu_seqlens_q, cu_seqlens_kv = [
-            torch.Tensor([0, 2, 3]).to(dtype=torch.int32, device=tensor_device) for _ in range(2)
-        ]
        kwargs["cu_seqlens_q"] = cu_seqlens_q
        kwargs["cu_seqlens_kv"] = cu_seqlens_kv
        kwargs["max_seqlen_q"] = config.max_seqlen_q
@@ -75,26 +94,47 @@ def test_current_device(model, module):
        model = DotProductAttention(
            config.num_heads, config.head_dim_qk, qkv_format="thd", attn_mask_type="padding"
        )
-        num_tokens = torch.randint(0, config.max_seqlen_q, (1,)).item()
+        seqlens_q = torch.randint(
+            1,
+            config.max_seqlen_q,
+            [config.batch_size],
+            dtype=torch.int32,
+            device=f"cuda:{tensor_device}",
+        )
+        cu_seqlens_q = torch.zeros(
+            config.batch_size + 1, dtype=torch.int32, device=f"cuda:{tensor_device}"
+        )
+        cu_seqlens_q[1:] = torch.cumsum(seqlens_q, dim=0)
+        seqlens_kv = torch.randint(
+            1,
+            config.max_seqlen_kv,
+            [config.batch_size],
+            dtype=torch.int32,
+            device=f"cuda:{tensor_device}",
+        )
+        cu_seqlens_kv = torch.zeros(
+            config.batch_size + 1, dtype=torch.int32, device=f"cuda:{tensor_device}"
+        )
+        cu_seqlens_kv[1:] = torch.cumsum(seqlens_kv, dim=0)
+        num_tokens = cu_seqlens_q[-1]
        args = [
            torch.randn(
                num_tokens,
                config.num_heads,
                config.head_dim_qk,
                dtype=dtype,
-                device=tensor_device,
+                device=f"cuda:{tensor_device}",
                requires_grad=True,
            )
            for _ in range(3)
        ]
-        cu_seqlens_q, cu_seqlens_kv = [
-            torch.Tensor([0, 2, 3]).to(dtype=torch.int32, device=tensor_device) for _ in range(2)
-        ]
        kwargs["cu_seqlens_q"] = cu_seqlens_q
        kwargs["cu_seqlens_kv"] = cu_seqlens_kv
        kwargs["max_seqlen_q"] = config.max_seqlen_q
        kwargs["max_seqlen_kv"] = config.max_seqlen_kv
-        bwd_args = [torch.randn(num_tokens, config.hidden_size, dtype=dtype, device=tensor_device)]
+        bwd_args = [
+            torch.randn(num_tokens, config.hidden_size, dtype=dtype, device=f"cuda:{tensor_device}")
+        ]
    elif module == "Linear":
        model = Linear(
            config.hidden_size,