remove two_batch_overlap of moe and update use_mla

11b94900 · zhuwenwen · 6605af8e · 11b94900 · 11b94900
Commit 11b94900 authored Oct 12, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 1 addition and 4 deletions

vllm/model_executor/layers/fused_moe/layer.py vllm/model_executor/layers/fused_moe/layer.py +0 -3

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +1 -1

No files found.
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1200,9 +1200,6 @@ class FusedMoE(CustomOp):

        self.quant_method.create_weights(layer=self, **moe_quant_params)

-        from vllm.two_batch_overlap.two_batch_overlap import tbo_all_reduce
-        self.tbo_all_reduce = tbo_all_reduce
-
        # The moe_fused_gate kernel currently requires that num_experts/num_expert_group does not exceed MAX_VPT=32. Once the kernel can handle MAX_VPT > 32, we can remove this assertion.
        self.use_fused_gate = envs.VLLM_ENABLE_MOE_FUSED_GATE \
            and self.e_score_correction_bias is not None \

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3854,7 +3854,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                              kv_cache_spec.page_size_bytes)
                if isinstance(kv_cache_spec, AttentionSpec):
                    has_attn = True
-                    if envs.VLLM_USE_FLASH_ATTN_PA and not kv_cache_spec.use_mla:
+                    if envs.VLLM_USE_FLASH_ATTN_PA and not self.vllm_config.model_config.use_mla:
                        key_cache_shape, value_cache_shape = attn_backend.get_kv_cache_shape(
                            num_blocks,
                            kv_cache_spec.block_size,