Commit 16f88a8a authored by wangmin6
Browse files

Merge branch 'v0.15.1-dev_lightop_fill_moe_align' into 'v0.15.1-dev'

moe: 补齐 fill+moe_align 融合开关语义

See merge request dcutoolkit/deeplearing/vllm!484
parents 5a14b60c 706c031c
......@@ -306,6 +306,7 @@ if TYPE_CHECKING:
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE: bool = False
VLLM_USE_FUSED_DTBMM: bool = False # DOUBLE TRANS BMM FP8
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN: bool = False
VLLM_USE_CUDA_GRAPH_SIZES: bool = False
......@@ -1910,6 +1911,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
).lower()
in ("true", "1")
),
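# If set to 1/True, moe_align_block_size skips the separate sorted_ids.fill_()
# call and asks the lightop kernel to do the fill itself (Is_fuse_fill=True).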
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in
("true", "1")),
# If set to 1/True, enable the fused QKV split + RMSNorm + RoPE + KV update path, as used by GLM-4.7 MoE attention.
"VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE":
......
......@@ -92,6 +92,8 @@ def moe_align_block_size(
sorted_ids = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
)
if not envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIGN:
sorted_ids.fill_(topk_ids.numel())
max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
if expert_map is not None:
expert_ids = torch.zeros(
......@@ -102,6 +104,7 @@ def moe_align_block_size(
(max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
)
num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
is_fuse_fill = envs.VLLM_USE_LIGHTOP_FILL_MOE_ALIGN
if envs.VLLM_USE_LIGHTOP or expert_mask is not None:
from lightop import op as op
......@@ -115,7 +118,7 @@ def moe_align_block_size(
expert_map = expert_map,
expert_mask = expert_mask,
num_local_tokens = None,
Is_fuse_fill = True,
Is_fuse_fill = is_fuse_fill,
)
else:
if envs.VLLM_USE_LIGHTOP_MOE_ALIGN:
......@@ -130,7 +133,7 @@ def moe_align_block_size(
expert_map = None,
expert_mask = None,
num_local_tokens = None,
Is_fuse_fill = True,
Is_fuse_fill = is_fuse_fill,
)
else:
ops.moe_align_block_size(
......
......@@ -194,6 +194,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os.environ['VLLM_USE_OPT_CAT'] = '1'
if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"):
os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1'
if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
# if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
# if not envs.is_set("USE_FUSED_RMS_QUANT"):
# os.environ['USE_FUSED_RMS_QUANT'] = '1'
......@@ -205,6 +208,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
if architectures in [['Qwen3MoeForCausalLM']]:
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'
if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
......@@ -231,6 +236,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
os.environ['VLLM_USE_OPT_CAT'] = '1'
if not envs.is_set("VLLM_USE_FUSED_FILL_RMS_CAT"):
os.environ['VLLM_USE_FUSED_FILL_RMS_CAT'] = '1'
if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
# if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
# if not envs.is_set("USE_FUSED_RMS_QUANT"):
# os.environ['USE_FUSED_RMS_QUANT'] = '1'
......@@ -242,6 +250,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
if architectures in [['Qwen3MoeForCausalLM']]:
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN"):
os.environ['VLLM_USE_LIGHTOP_FILL_MOE_ALIGN'] = '1'
if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'
if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
......@@ -307,6 +317,7 @@ def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
model_config.runner_type,
model_config.trust_remote_code,
model_config.model_impl,
model_config.quantization,
tuple(getattr(model_config.hf_config, "architectures", [])),
)
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment