add VLLM_USE_LIGHT_OP to optimize moe_align_block_size and moe_fused_gate

a54ab95d · zhuwenwen · ca034fd5 · a54ab95d · a54ab95d · a54ab95d
Commit a54ab95d authored Sep 07, 2025 by zhuwenwen
5 changed files
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -24,6 +24,7 @@ torch == 2.5.1
 triton == 3.0.0
 flash_attn == 2.6.1
 flash_mla == 1.0.0
+lightop == 0.5.0
 lmslim == 0.3.1
 numa
 python-multipart

--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -164,6 +164,7 @@ if TYPE_CHECKING:
    VLLM_USE_FLASH_ATTN_PA: bool = False
    VLLM_USE_APEX_RN: bool = False
    VLLM_USE_GLOBAL_CACHE13: bool = False
+    VLLM_USE_LIGHT_OP: bool = False

 def get_default_cache_root():
    return os.getenv(
@@ -1089,7 +1090,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_GLOBAL_CACHE13":
        lambda: (os.environ.get("VLLM_USE_GLOBAL_CACHE13", "False").lower() in
                 ("true", "1")),
-
+    # If set, vLLM uses lightop ops for moe_align_block_size and moe_fused_gate
+    "VLLM_USE_LIGHT_OP":
+        lambda: (os.environ.get("VLLM_USE_LIGHT_OP", "False").lower() in
+                 ("true", "1")),
 }

 # --8<-- [end:env-vars-definition]

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -42,6 +42,7 @@ from vllm.platforms.interface import CpuArchEnum

 from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx
 from vllm import _custom_ops as ops
+from lightop import op

 if current_platform.is_cuda_alike():
    from .fused_batched_moe import BatchedTritonExperts
@@ -1277,7 +1278,18 @@ class FusedMoE(torch.nn.Module):
            assert topk_group is not None
            assert num_expert_group is not None
            if use_fused_gate:
-                topk_weights, topk_ids = ops.moe_fused_gate(
+                if envs.VLLM_USE_LIGHT_OP:
+                    topk_weights, topk_ids = op.moe_fused_gate(
+                        router_logits,
+                        e_score_correction_bias,
+                        num_expert_group,
+                        topk_group,
+                        top_k,
+                        routed_scaling_factor=routed_scaling_factor,
+                        n_share_experts_fusion=0,
+                    )
+                else:
+                    topk_weights, topk_ids = ops.moe_fused_gate(
                    router_logits,
                    e_score_correction_bias,
                    num_expert_group,

--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -8,6 +8,9 @@ from vllm import _custom_ops as ops
 from vllm.triton_utils import tl, triton
 from vllm.utils import cdiv, round_up

+import vllm.envs as envs
+from lightop import op
+

 @triton.jit
 def moe_align_block_size_stage1(
@@ -229,8 +232,12 @@ def moe_align_block_size(
                                      dtype=torch.int32,
                                      device=topk_ids.device)

-    ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                             expert_ids, num_tokens_post_pad)
+    if envs.VLLM_USE_LIGHT_OP:
+        op.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
+                                expert_ids, num_tokens_post_pad)
+    else:
+        ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
+                                expert_ids, num_tokens_post_pad)
    if expert_map is not None:
        expert_ids = expert_map[expert_ids]