remove unused impl and envs

b5ea93d4 · zhuwenwen · bc60d70d · b5ea93d4 · b5ea93d4
Commit b5ea93d4 authored Feb 11, 2026 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 23 deletions

vllm/envs.py +0 -10

vllm/model_executor/layers/fused_moe/fuse_moe_w16a16_marlin.py +5 -13

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -292,8 +292,6 @@ if TYPE_CHECKING:
    VLLM_USE_FUSED_FILL_RMS_CAT: bool = False
    VLLM_W8A8_BACKEND: int = 3
    VLLM_REJECT_SAMPLE_OPT: bool = False
-    VLLM_USE_OPT_MOE_SUM: bool = False
-    VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD: bool = False
    VLLM_USE_MOE_W16A16_TRITON: bool = False


@@ -1845,14 +1843,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_REJECT_SAMPLE_OPT":
        lambda: (os.getenv('VLLM_REJECT_SAMPLE_OPT', 'True').lower() in
                 ("true", "1")),
-    # vLLM will use triton moe_sum
-    "VLLM_USE_OPT_MOE_SUM":
-        lambda: (os.environ.get("VLLM_USE_OPT_MOE_SUM", "False").lower() in
-                 ("true", "1")),
-    # vLLM will use lightop moe_sum_mul_add
-    "VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD":
-        lambda: (os.environ.get("VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD", "False").lower() in
-                 ("true", "1")),
    # Force using Triton MoE path (disable Marlin W16A16 MoE).
    "VLLM_USE_MOE_W16A16_TRITON":
        lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in

--- a/vllm/model_executor/layers/fused_moe/fuse_moe_w16a16_marlin.py
+++ b/vllm/model_executor/layers/fused_moe/fuse_moe_w16a16_marlin.py
@@ -397,21 +397,13 @@ def fused_experts_impl_w16a16_marlin(hidden_states: torch.Tensor,
        )
        intermediate_cache3 = intermediate_cache3.view(-1, top_k_num, K)

-        if envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD: 
+        if envs.VLLM_USE_LIGHTOP_MOE_SUM:
            from lightop import op as op
            op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()),
-                    output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=shared_output[begin_chunk_idx:end_chunk_idx], 
-                    expert_mask=None, num_local_tokens=None, factor=routed_scaling_factor)
+                output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=None, 
+                expert_mask=None, num_local_tokens=None, factor=1.0)
        else:
-            if envs.VLLM_USE_LIGHTOP_MOE_SUM:
-                from lightop import op as op
-                op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()),
-                    output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=None, 
-                    expert_mask=None, num_local_tokens=None, factor=1.0)
-            elif envs.VLLM_USE_OPT_MOE_SUM:
-                moe_reduce_dispatch(intermediate_cache3.view(*intermediate_cache3.size()), out_hidden_states[begin_chunk_idx:end_chunk_idx], begin_chunk_idx, end_chunk_idx)
-            else:
-                ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
-                                out_hidden_states[begin_chunk_idx:end_chunk_idx])
+            ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
+                            out_hidden_states[begin_chunk_idx:end_chunk_idx])

    return out_hidden_states
\ No newline at end of file