[V1] Fix local chunked attention always disabled (#21419)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>

[V1] Fix local chunked attention always disabled (#21419)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
78c13e30 · Yong Hoon Shin · GitHub · 5c9b807b · 78c13e30
Unverified commit 78c13e30, authored Jul 23, 2025 by Yong Hoon Shin; committed by GitHub on Jul 23, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 1 deletion

vllm/attention/layer.py (+2 -1)

No files found.
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -143,6 +143,8 @@ class Attention(nn.Module):
        # the backends)
        if envs.VLLM_USE_V1:
            self.use_irope = extra_impl_args.pop("use_irope", False)
+        else:
+            self.use_irope = extra_impl_args.get("use_irope", False)

        quant_method = quant_config.get_quant_method(
            self, prefix=prefix) if quant_config else None
@@ -177,7 +179,6 @@ class Attention(nn.Module):
                             kv_sharing_target_layer_name, **extra_impl_args)
        self.backend = backend_name_to_enum(attn_backend.get_name())
        self.dtype = dtype
-        self.use_irope = extra_impl_args.get("use_irope", False)

        # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
        # torch.compile works by registering the attention as one giant