[BUG FIX][NON-CUDA] Quick fix to avoid referencing the cudagraph_unsafe tag in attention when it is unavailable (#25298)

Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>

[BUG FIX][NON-CUDA] Quick fix to avoid referencing the cudagraph_unsafe tag in attention when it is unavailable (#25298)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
6c5f82e5 · Chendi.Xue · GitHub · b7f186bb · 6c5f82e5
Unverified Commit 6c5f82e5 authored Sep 19, 2025 by Chendi.Xue Committed by GitHub Sep 20, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 2 deletions

vllm/attention/layer.py vllm/attention/layer.py +6 -2

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -29,6 +29,10 @@ from vllm.utils import GiB_bytes, direct_register_custom_op

 logger = init_logger(__name__)
 USE_XFORMERS_OPS = None
+try:
+    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, )
+except AttributeError:
+    tag_cudagraph_unsafe = ()  # type: ignore[assignment]


 def check_xformers_availability():
@@ -577,7 +581,7 @@ direct_register_custom_op(
    mutates_args=[],
    fake_impl=unified_attention_fake,
    dispatch_key=current_platform.dispatch_key,
-    tags=(torch._C.Tag.cudagraph_unsafe, ),
+    tags=tag_cudagraph_unsafe,
 )


@@ -628,5 +632,5 @@ direct_register_custom_op(
    mutates_args=["output", "output_block_scale"],
    fake_impl=unified_attention_with_output_fake,
    dispatch_key=current_platform.dispatch_key,
-    tags=(torch._C.Tag.cudagraph_unsafe, ),
+    tags=tag_cudagraph_unsafe,
 )