Update VLLM_USE_OPT_RESHAPE_AND_CACHE to support bf16 and Qwen3 dense models

263f45a4 · zhuwenwen · ac28ab22 · 263f45a4 · 263f45a4
Commit 263f45a4 authored Feb 04, 2026 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

vllm/model_executor/model_loader/utils.py vllm/model_executor/model_loader/utils.py +3 -0

vllm/v1/attention/backends/flash_attn.py vllm/v1/attention/backends/flash_attn.py +1 -1

No files found.
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -211,6 +211,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                        os.environ['VLLM_USE_FUSE_SILU_AND_MUL'] = '1'
                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
+                # if architectures in [['Qwen3ForCausalLM']]:
+                #     if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
+                #         os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '0'

            if architectures in [['DeepseekV32ForCausalLM']]:
                if not envs.is_set("VLLM_USE_V32_ENCODE"):

--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -898,7 +898,7 @@ class FlashAttentionImpl(AttentionImpl):
        # op uses the slot_mapping's shape to determine the number of
        # actual tokens.
        if current_platform.is_rocm():
-            if envs.VLLM_USE_OPT_RESHAPE_AND_CACHE and key.dtype == value.dtype == torch.float16:
+            if envs.VLLM_USE_OPT_RESHAPE_AND_CACHE:
                from lightop import reshape_and_cache_cuda
                reshape_and_cache_cuda(
                    key,