Commit 16f88a8a authored by wangmin6
Browse files

Merge branch 'v0.15.1-dev_lightop_fill_moe_align' into 'v0.15.1-dev'

moe: 补齐 fill+moe_align 融合开关语义

See merge request dcutoolkit/deeplearing/vllm!484
parents 5a14b60c 706c031c
...@@ -306,6 +306,7 @@ if TYPE_CHECKING: ...@@ -306,6 +306,7 @@ if TYPE_CHECKING:
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE: bool = False VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE: bool = False
VLLM_USE_FUSED_DTBMM: bool = False # DOUBLE TRANS BMM FP8 VLLM_USE_FUSED_DTBMM: bool = False # DOUBLE TRANS BMM FP8
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN: bool = False
VLLM_USE_CUDA_GRAPH_SIZES: bool = False VLLM_USE_CUDA_GRAPH_SIZES: bool = False
...@@ -1910,6 +1911,10 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1910,6 +1911,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
).lower() ).lower()
in ("true", "1") in ("true", "1")
), ),
# If set to 1/True, moe_align_block_size skips the separate sorted_ids.fill_() and passes Is_fuse_fill=True to the lightop align call instead.
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in
("true", "1")),
#If set to 1/True, enable fuse split qkv+rmsnorm+rope+kv update just like glm4.7 moe attention. #If set to 1/True, enable fuse split qkv+rmsnorm+rope+kv update just like glm4.7 moe attention.
"VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE": "VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE":
......
...@@ -92,6 +92,8 @@ def moe_align_block_size( ...@@ -92,6 +92,8 @@ def moe_align_block_size(
sorted_ids = torch.empty( sorted_ids = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
) )
if not envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIGN:
sorted_ids.fill_(topk_ids.numel())
max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size) max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
if expert_map is not None: if expert_map is not None:
expert_ids = torch.zeros( expert_ids = torch.zeros(
...@@ -102,6 +104,7 @@ def moe_align_block_size( ...@@ -102,6 +104,7 @@ def moe_align_block_size(
(max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
) )
num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
is_fuse_fill = envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
if envs.VLLM_USE_LIGHTOP or expert_mask is not None: if envs.VLLM_USE_LIGHTOP or expert_mask is not None:
from lightop import op as op from lightop import op as op
...@@ -115,7 +118,7 @@ def moe_align_block_size( ...@@ -115,7 +118,7 @@ def moe_align_block_size(
expert_map = expert_map, expert_map = expert_map,
expert_mask = expert_mask, expert_mask = expert_mask,
num_local_tokens = None, num_local_tokens = None,
Is_fuse_fill = True, Is_fuse_fill = is_fuse_fill,
) )
else: else:
if envs.VLLM_USE_LIGHTOP_MOE_ALIGN: if envs.VLLM_USE_LIGHTOP_MOE_ALIGN:
...@@ -130,7 +133,7 @@ def moe_align_block_size( ...@@ -130,7 +133,7 @@ def moe_align_block_size(
expert_map = None, expert_map = None,
expert_mask = None, expert_mask = None,
num_local_tokens = None, num_local_tokens = None,
Is_fuse_fill = True, Is_fuse_fill = is_fuse_fill,
) )
else: else:
ops.moe_align_block_size( ops.moe_align_block_size(
......
...@@ -194,6 +194,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ...@@ -194,6 +194,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os.environ['VLLM_USE_OPT_CAT'] = '1' os.environ['VLLM_USE_OPT_CAT'] = '1'
if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"): if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"):
os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1' os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1'
if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
# if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}: # if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
# if not envs.is_set("USE_FUSED_RMS_QUANT"): # if not envs.is_set("USE_FUSED_RMS_QUANT"):
# os.environ['USE_FUSED_RMS_QUANT'] = '1' # os.environ['USE_FUSED_RMS_QUANT'] = '1'
...@@ -205,6 +208,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ...@@ -205,6 +208,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
if architectures in [['Qwen3MoeForCausalLM']]: if architectures in [['Qwen3MoeForCausalLM']]:
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"): if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1' os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"): if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1' os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'
if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"): if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
...@@ -231,6 +236,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ...@@ -231,6 +236,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os.environ['VLLM_USE_OPT_CAT'] = '1' os.environ['VLLM_USE_OPT_CAT'] = '1'
if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"): if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"):
os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1' os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1'
if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
# if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}: # if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
# if not envs.is_set("USE_FUSED_RMS_QUANT"): # if not envs.is_set("USE_FUSED_RMS_QUANT"):
# os.environ['USE_FUSED_RMS_QUANT'] = '1' # os.environ['USE_FUSED_RMS_QUANT'] = '1'
...@@ -242,6 +250,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ...@@ -242,6 +250,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
if architectures in [['Qwen3MoeForCausalLM']]: if architectures in [['Qwen3MoeForCausalLM']]:
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"): if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1' os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"): if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1' os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'
if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"): if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
...@@ -307,6 +317,7 @@ def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ...@@ -307,6 +317,7 @@ def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
model_config.runner_type, model_config.runner_type,
model_config.trust_remote_code, model_config.trust_remote_code,
model_config.model_impl, model_config.model_impl,
model_config.quantization,
tuple(getattr(model_config.hf_config, "architectures", [])), tuple(getattr(model_config.hf_config, "architectures", [])),
) )
) )
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment