"vscode:/vscode.git/clone" did not exist on "e1098ced95146d98a4ed46c81ee709013d54fb1f"
Commit 11b94900 authored by zhuwenwen
Browse files

Remove two_batch_overlap from MoE and read use_mla from the model config

parent 6605af8e
......@@ -1200,9 +1200,6 @@ class FusedMoE(CustomOp):
self.quant_method.create_weights(layer=self, **moe_quant_params)
from vllm.two_batch_overlap.two_batch_overlap import tbo_all_reduce
self.tbo_all_reduce = tbo_all_reduce
# The moe_fused_gate kernel currently requires that num_experts/num_expert_group does not exceed MAX_VPT=32. Once the kernel can handle MAX_VPT > 32, this assertion can be removed.
self.use_fused_gate = envs.VLLM_ENABLE_MOE_FUSED_GATE \
and self.e_score_correction_bias is not None \
......
......@@ -3854,7 +3854,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
kv_cache_spec.page_size_bytes)
if isinstance(kv_cache_spec, AttentionSpec):
has_attn = True
if envs.VLLM_USE_FLASH_ATTN_PA and not kv_cache_spec.use_mla:
if envs.VLLM_USE_FLASH_ATTN_PA and not self.vllm_config.model_config.use_mla:
key_cache_shape, value_cache_shape = attn_backend.get_kv_cache_shape(
num_blocks,
kv_cache_spec.block_size,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment