Commit a921f34c authored by laibao
Browse files

增加 VLLM_USE_FUSE_SILU_AND_MUL 环境变量,用来控制 fused MoE 里 silu+mul 的融合 kernel。

parent be22412f
......@@ -184,6 +184,7 @@ if TYPE_CHECKING:
VLLM_USE_PP_BALANCE: bool = False
VLLM_USE_ZERO_MTP: bool = False
VLLM_USE_CUDA_GRAPH_SIZES: bool = False
VLLM_USE_FUSE_SILU_AND_MUL: bool = True
def get_default_cache_root():
return os.getenv(
......@@ -1194,6 +1195,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_CUDA_GRAPH_SIZES":
lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'True').lower() in
("true", "1")),
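# If enabled (default: True), the fused MoE silu activation uses lightop's
# fuse_silu_and_mul kernel; otherwise it uses torch.ops._C.silu_and_mul.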
"VLLM_USE_FUSE_SILU_AND_MUL":
lambda: (os.environ.get("VLLM_USE_FUSE_SILU_AND_MUL", "True").lower() in
("true", "1")),
}
# --8<-- [end:env-vars-definition]
......
......@@ -1859,8 +1859,12 @@ def fused_experts_impl(
use_nn_moe=use_nn_moe)
if activation == "silu":
torch.ops._C.silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
if envs.VLLM_USE_FUSE_SILU_AND_MUL:
from lightop import fuse_silu_and_mul
fuse_silu_and_mul(intermediate_cache1.view(-1, N),intermediate_cache2)
else:
torch.ops._C.silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
elif activation == "gelu":
torch.ops._C.gelu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment