Commit a54ab95d authored by zhuwenwen
Browse files

add VLLM_USE_LIGHT_OP to optimize moe_align_block_size and moe_fused_gate

parent ca034fd5
......@@ -24,6 +24,7 @@ torch == 2.5.1
triton == 3.0.0
flash_attn == 2.6.1
flash_mla == 1.0.0
lightop == 0.5.0
lmslim == 0.3.1
numa
python-multipart
......
......@@ -164,6 +164,7 @@ if TYPE_CHECKING:
VLLM_USE_FLASH_ATTN_PA: bool = False
VLLM_USE_APEX_RN: bool = False
VLLM_USE_GLOBAL_CACHE13: bool = False
VLLM_USE_LIGHT_OP: bool = False
def get_default_cache_root():
return os.getenv(
......@@ -1089,7 +1090,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_GLOBAL_CACHE13":
lambda: (os.environ.get("VLLM_USE_GLOBAL_CACHE13", "False").lower() in
("true", "1")),
# If set, vLLM uses the lightop kernels for moe_align_block_size and moe_fused_gate
"VLLM_USE_LIGHT_OP":
lambda: (os.environ.get("VLLM_USE_LIGHT_OP", "False").lower() in
("true", "1")),
}
# --8<-- [end:env-vars-definition]
......
......@@ -42,6 +42,7 @@ from vllm.platforms.interface import CpuArchEnum
from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx
from vllm import _custom_ops as ops
from lightop import op
if current_platform.is_cuda_alike():
from .fused_batched_moe import BatchedTritonExperts
......@@ -1277,6 +1278,17 @@ class FusedMoE(torch.nn.Module):
assert topk_group is not None
assert num_expert_group is not None
if use_fused_gate:
if envs.VLLM_USE_LIGHT_OP:
topk_weights, topk_ids = op.moe_fused_gate(
router_logits,
e_score_correction_bias,
num_expert_group,
topk_group,
top_k,
routed_scaling_factor=routed_scaling_factor,
n_share_experts_fusion=0,
)
else:
topk_weights, topk_ids = ops.moe_fused_gate(
router_logits,
e_score_correction_bias,
......
......@@ -8,6 +8,9 @@ from vllm import _custom_ops as ops
from vllm.triton_utils import tl, triton
from vllm.utils import cdiv, round_up
import vllm.envs as envs
from lightop import op
@triton.jit
def moe_align_block_size_stage1(
......@@ -229,6 +232,10 @@ def moe_align_block_size(
dtype=torch.int32,
device=topk_ids.device)
if envs.VLLM_USE_LIGHT_OP:
op.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
expert_ids, num_tokens_post_pad)
else:
ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
expert_ids, num_tokens_post_pad)
if expert_map is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment