For Qwen3 dense models, set VLLM_USE_OPT_RESHAPE_AND_CACHE=1

706e7a4f · zhuwenwen · a27f634a · 706e7a4f · 706e7a4f
Commit 706e7a4f authored Feb 10, 2026 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 1 deletion

vllm/model_executor/model_loader/utils.py vllm/model_executor/model_loader/utils.py +8 -0

vllm/v1/attention/backends/flash_attn.py vllm/v1/attention/backends/flash_attn.py +1 -1

No files found.
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -212,6 +212,10 @@ def _get_model_architecture(
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
                    if not envs.is_set("VLLM_USE_FUSED_RMS_ROPE"):
                        os.environ['VLLM_USE_FUSED_RMS_ROPE'] = '1'
+                
+                if architectures in [['Qwen3ForCausalLM']]:
+                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
+                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'

            if architectures in [['DeepseekV32ForCausalLM']]:
                if not envs.is_set("VLLM_USE_V32_ENCODE"):
@@ -241,6 +245,10 @@ def _get_model_architecture(
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
                    if not envs.is_set("VLLM_USE_FUSED_RMS_ROPE"):
                        os.environ['VLLM_USE_FUSED_RMS_ROPE'] = '1'
+                        
+                if architectures in [['Qwen3ForCausalLM']]:
+                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
+                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
            
            if architectures in [['DeepseekV32ForCausalLM']]:
                if not envs.is_set("VLLM_USE_V32_ENCODE"):

--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -556,7 +556,7 @@ class FlashAttentionImpl(AttentionImpl):
                    layer._v_scale,
                )
            else:
-                if envs.VLLM_USE_OPT_RESHAPE_AND_CACHE and key.dtype == value.dtype == torch.float16:
+                if envs.VLLM_USE_OPT_RESHAPE_AND_CACHE:
                    from lightop import reshape_and_cache_cuda
                    reshape_and_cache_cuda(
                        key,