For Qwen3 dense, set VLLM_USE_OPT_RESHAPE_AND_CACHE=1

39ff5a5a · zhuwenwen · a1abfaf3 · 39ff5a5a · 39ff5a5a
Commit 39ff5a5a authored Feb 11, 2026 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 1 deletion

vllm/model_executor/model_loader/utils.py +6 -0

vllm/v1/attention/backends/flash_attn.py +1 -1

No files found.
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -290,6 +290,9 @@ def get_model_architecture(
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
                    if not envs.is_set("VLLM_USE_FUSED_RMS_ROPE"):
                        os.environ['VLLM_USE_FUSED_RMS_ROPE'] = '1'
+                if architectures in [['Qwen3ForCausalLM']]:
+                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
+                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
                
            if architectures in [['DeepseekV32ForCausalLM']]:
                if not envs.is_set("VLLM_USE_V32_ENCODE"):
@@ -340,6 +343,9 @@ def get_model_architecture(
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
                    if not envs.is_set("VLLM_USE_FUSED_RMS_ROPE"):
                        os.environ['VLLM_USE_FUSED_RMS_ROPE'] = '1'
+                if architectures in [['Qwen3ForCausalLM']]:
+                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
+                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
                 
            if architectures in [['DeepseekV32ForCausalLM']]:
                if not envs.is_set("VLLM_USE_V32_ENCODE"):

--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -577,7 +577,7 @@ class FlashAttentionImpl(AttentionImpl):
                    layer._v_scale,
                )
            else:
-                if envs.VLLM_USE_OPT_RESHAPE_AND_CACHE and key.dtype == value.dtype == torch.float16:
+                if envs.VLLM_USE_OPT_RESHAPE_AND_CACHE:
                    from lightop import reshape_and_cache_cuda
                    reshape_and_cache_cuda(
                        key, value,