Commit 1217257c authored by zhuwenwen
Browse files

fix run error

parent 8301427e
...@@ -226,14 +226,16 @@ def flash_mla_with_kvcache( ...@@ -226,14 +226,16 @@ def flash_mla_with_kvcache(
out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla( out, softmax_lse = flash_mla_cuda.fwd_kvcache_mla(
q, q,
k_cache, k_cache,
None,
head_dim_v, head_dim_v,
cache_seqlens, cache_seqlens,
block_table, block_table,
softmax_scale, softmax_scale,
causal, causal,
tile_scheduler_metadata, tile_scheduler_metadata,
num_splits) num_splits,
is_fp8_kvcache,
indices,
)
else: else:
out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla( out, softmax_lse = torch.ops._flashmla_C.fwd_kvcache_mla(
q, q,
......
...@@ -2062,8 +2062,6 @@ class FusedMoE(CustomOp): ...@@ -2062,8 +2062,6 @@ class FusedMoE(CustomOp):
router_logits=router_logits, router_logits=router_logits,
use_nn_moe=self.use_nn_moe, use_nn_moe=self.use_nn_moe,
use_fused_gate=self.use_fused_gate, use_fused_gate=self.use_fused_gate,
use_nn_moe=self.use_nn_moe,
use_fused_gate=self.use_fused_gate,
i_q=i_q, i_q=i_q,
i_s=i_s, i_s=i_s,
) )
......
...@@ -228,11 +228,10 @@ class RocmPlatform(Platform): ...@@ -228,11 +228,10 @@ class RocmPlatform(Platform):
logger.info_once("Using Sparse MLA backend on V1 engine.") logger.info_once("Using Sparse MLA backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path() return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path()
if use_mla: if attn_selector_config.use_mla:
# if use_sparse: # if attn_selector_config.use_sparse:
# logger.info_once("Using Sparse MLA backend on V1 engine.") # logger.info_once("Using Sparse MLA backend on V1 engine.")
# return ("vllm.v1.attention.backends.mla.flashmla_sparse." # return AttentionBackendEnum.FLASHMLA_SPARSE.get_path()
# "FlashMLASparseBackend")
use_flashmla = selected_backend == AttentionBackendEnum.FLASHMLA or envs.VLLM_USE_FLASH_MLA use_flashmla = selected_backend == AttentionBackendEnum.FLASHMLA or envs.VLLM_USE_FLASH_MLA
use_triton = selected_backend == AttentionBackendEnum.TRITON_MLA or ( use_triton = selected_backend == AttentionBackendEnum.TRITON_MLA or (
......
...@@ -56,6 +56,7 @@ from vllm.v1.attention.backends.utils import ( ...@@ -56,6 +56,7 @@ from vllm.v1.attention.backends.utils import (
get_kv_cache_layout, get_kv_cache_layout,
) )
from vllm.v1.kv_cache_interface import AttentionSpec from vllm.v1.kv_cache_interface import AttentionSpec
import vllm.envs as envs
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -81,8 +81,8 @@ class Worker(WorkerBase): ...@@ -81,8 +81,8 @@ class Worker(WorkerBase):
) )
# configure float32 matmul precision according to vLLM env. # configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION # precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.backends.cuda.matmul.fp32_precision = precision # torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code: if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing # note: lazy import to avoid importing torch before initializing
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment