Commit a921f34c authored by laibao's avatar laibao
Browse files

增加 VLLM_USE_FUSE_SILU_AND_MUL 环境变量，用来控制 fused MoE 里是否使用 silu+mul 的融合 kernel（默认开启）。

parent be22412f
...@@ -184,6 +184,7 @@ if TYPE_CHECKING: ...@@ -184,6 +184,7 @@ if TYPE_CHECKING:
VLLM_USE_PP_BALANCE: bool = False VLLM_USE_PP_BALANCE: bool = False
VLLM_USE_ZERO_MTP: bool = False VLLM_USE_ZERO_MTP: bool = False
VLLM_USE_CUDA_GRAPH_SIZES: bool = False VLLM_USE_CUDA_GRAPH_SIZES: bool = False
VLLM_USE_FUSE_SILU_AND_MUL: bool = True
def get_default_cache_root(): def get_default_cache_root():
return os.getenv( return os.getenv(
...@@ -1194,6 +1195,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1194,6 +1195,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_CUDA_GRAPH_SIZES": "VLLM_USE_CUDA_GRAPH_SIZES":
lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'True').lower() in lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'True').lower() in
("true", "1")), ("true", "1")),
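# If enabled ("true" or "1", case-insensitive; the default), fused MoE uses
# lightop's fuse_silu_and_mul kernel for the "silu" activation; otherwise it
# falls back to torch.ops._C.silu_and_mul.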
"VLLM_USE_FUSE_SILU_AND_MUL":
lambda: (os.environ.get("VLLM_USE_FUSE_SILU_AND_MUL", "True").lower() in
("true", "1")),
} }
# --8<-- [end:env-vars-definition] # --8<-- [end:env-vars-definition]
......
...@@ -1859,6 +1859,10 @@ def fused_experts_impl( ...@@ -1859,6 +1859,10 @@ def fused_experts_impl(
use_nn_moe=use_nn_moe) use_nn_moe=use_nn_moe)
if activation == "silu": if activation == "silu":
if envs.VLLM_USE_FUSE_SILU_AND_MUL:
from lightop import fuse_silu_and_mul
fuse_silu_and_mul(intermediate_cache1.view(-1, N),intermediate_cache2)
else:
torch.ops._C.silu_and_mul(intermediate_cache2, torch.ops._C.silu_and_mul(intermediate_cache2,
intermediate_cache1.view(-1, N)) intermediate_cache1.view(-1, N))
elif activation == "gelu": elif activation == "gelu":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment