[WA] fix output data is nan in CI test "test_moe_eval_accuracy_large.py" (#7021)

Co-authored-by: wunhuang <wunhuang@amd.com> Co-authored-by: HAI <hixiao@gmail.com>

[WA] fix output data is nan in CI test "test_moe_eval_accuracy_large.py" (#7021)
Co-authored-by: wunhuang <wunhuang@amd.com> Co-authored-by: HAI <hixiao@gmail.com>
8ea7df61 · kk · GitHub · 4a102a2b · 8ea7df61
Unverified Commit 8ea7df61 authored Jun 11, 2025 by kk Committed by GitHub Jun 10, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 2 deletions

python/sglang/srt/layers/attention/aiter_backend.py python/sglang/srt/layers/attention/aiter_backend.py +15 -2

No files found.
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -717,6 +717,11 @@ class AiterIndicesUpdaterPrefill:
        self.req_to_token = model_runner.req_to_token_pool.req_to_token
        self.update = self.update_single_wrapper

+        # get the last index of the pool
+        self.pool_size = (
+            model_runner.token_to_kv_pool.size + model_runner.token_to_kv_pool.page_size
+        ) - 1
+
        self.kv_indices = None
        self.max_q_len = 0
        self.max_kv_len = 0
@@ -754,8 +759,16 @@ class AiterIndicesUpdaterPrefill:
            # Normal extend
            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
            kv_indptr = kv_indptr[: bs + 1]
-            kv_indices = torch.empty(
-                paged_kernel_lens_sum + 256,
+
+            # (TODO: Kk) WA - CI test_moe_eval_accuracy_large.py
+            # mha_batch_prefill reads 128 entries to do its computation.
+            # If real data is not long enough, the original padding value 0 is used,
+            # but location 0 will be made nan (noqa) in cuda graph capture mode,
+            # which causes the output tensor values to become nan.
+            # WA is to pad with the last index of the pool, which is not changed.
+            kv_indices = torch.full(
+                (paged_kernel_lens_sum + 128,),
+                self.pool_size,
                dtype=torch.int32,
                device=req_pool_indices.device,
            )