Unverified commit 78c13e30, authored by Yong Hoon Shin and committed by GitHub
Browse files

[V1] Fix local chunked attention always disabled (#21419)


Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
parent 5c9b807b
......@@ -143,6 +143,8 @@ class Attention(nn.Module):
# the backends)
if envs.VLLM_USE_V1:
self.use_irope = extra_impl_args.pop("use_irope", False)
else:
self.use_irope = extra_impl_args.get("use_irope", False)
quant_method = quant_config.get_quant_method(
self, prefix=prefix) if quant_config else None
......@@ -177,7 +179,6 @@ class Attention(nn.Module):
kv_sharing_target_layer_name, **extra_impl_args)
self.backend = backend_name_to_enum(attn_backend.get_name())
self.dtype = dtype
self.use_irope = extra_impl_args.get("use_irope", False)
# For cuda-alike (CUDA and ROCM) and cpu platforms, we control how
# torch.compile works by registering the attention as one giant
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment