update flash_mla_with_kvcache

set VLLM_USE_PIECEWISE=0

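Update flash_mla_with_kvcache: reorder the arguments passed to flash_mla_cuda.fwd_kvcache_mla on the ROCm path.
Set VLLM_USE_PIECEWISE=0: change the default of VLLM_USE_PIECEWISE from "True" to "False", so it is off unless the variable is set to "true" or "1".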
fd8e4a76 · zhuwenwen · 1871c26c · fd8e4a76 · fd8e4a76
Commit fd8e4a76 authored Dec 24, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

vllm/attention/ops/flashmla.py +2 -2

vllm/envs.py +1 -1

No files found.
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -160,8 +160,8 @@ def flash_mla_with_kvcache(
    else:
        if current_platform.is_rocm():
            out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
-                q, k_cache, block_table, cache_seqlens, head_dim_v, tile_scheduler_metadata,
+                q, k_cache,  head_dim_v, cache_seqlens, block_table, softmax_scale, 
-                num_splits, softmax_scale, causal,  is_fp8_kvcache,
+                causal, tile_scheduler_metadata, num_splits, is_fp8_kvcache,
                indices)
        else:
            out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1657,7 +1657,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
                 ("true", "1")), 
    # vLLM will use piecewise
    "VLLM_USE_PIECEWISE":
-        lambda: (os.environ.get("VLLM_USE_PIECEWISE", "True").lower() in
+        lambda: (os.environ.get("VLLM_USE_PIECEWISE", "False").lower() in
                 ("true", "1")), 
    # vllm will use encoding_dsv32.py for dpsk-v32
    "VLLM_USE_V32_ENCODE":