minor: zero workspace buffer init for flashinfer trtllm-gen attn (#22603)

1723ef1a · eigen · GitHub · 00d6cba0 · 1723ef1a · 1723ef1a
Unverified Commit 1723ef1a authored Aug 15, 2025 by eigen Committed by GitHub Aug 15, 2025
3 changed files
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -113,7 +113,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)

-    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer,
        kv_layout,
@@ -247,7 +247,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)

-    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
+    workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, kv_layout)
    wrapper.plan(q_indptr,

--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -203,7 +203,7 @@ class FlashInferState(AttentionState):

    def _get_workspace_buffer(self):
        if self._workspace_buffer is None:
-            self._workspace_buffer = torch.empty(
+            self._workspace_buffer = torch.zeros(
                FLASHINFER_WORKSPACE_BUFFER_SIZE,
                dtype=torch.uint8,
                device=self.runner.device)

--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -252,7 +252,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):

    def _get_workspace_buffer(self):
        if self._workspace_buffer is None:
-            self._workspace_buffer = torch.empty(
+            self._workspace_buffer = torch.zeros(
                FLASHINFER_WORKSPACE_BUFFER_SIZE,
                dtype=torch.uint8,
                device=self.device)