Unverified Commit 31c29257 authored by Yongye Zhu's avatar Yongye Zhu Committed by GitHub
Browse files

[MoE Refactor][17/N] Apply Refactor to Bf16 (#31827)


Signed-off-by: default avatarYongye Zhu <zyy1102000@gmail.com>
Co-authored-by: default avatarRobert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
parent 8c11001b
model_name: "Qwen/Qwen3-30B-A3B"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
...@@ -8,3 +8,4 @@ Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml ...@@ -8,3 +8,4 @@ Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
Qwen3-30B-A3B-BF16-triton.yaml
model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
accuracy_threshold: 0.92
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
env:
VLLM_USE_FLASHINFER_MOE_FP16: "1"
model_name: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
accuracy_threshold: 0.92
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
model_name: "mistralai/Mixtral-8x7B-v0.1"
accuracy_threshold: 0.58
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
env:
VLLM_USE_FLASHINFER_MOE_FP16: "1"
model_name: "mistralai/Mixtral-8x7B-v0.1"
accuracy_threshold: 0.58
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
model_name: "Qwen/Qwen3-30B-A3B"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel"
env:
VLLM_USE_FLASHINFER_MOE_FP16: "1"
model_name: "Qwen/Qwen3-30B-A3B"
accuracy_threshold: 0.88
num_questions: 1319
num_fewshot: 5
server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2"
...@@ -11,3 +11,7 @@ Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml ...@@ -11,3 +11,7 @@ Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml
Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml
Llama-4-Scout-BF16-fi-cutlass.yaml
Llama-4-Scout-BF16-triton.yaml
Mixtral-8x7B-BF16-fi-cutlass.yaml
Mixtral-8x7B-BF16-triton.yaml
...@@ -11,3 +11,5 @@ Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml ...@@ -11,3 +11,5 @@ Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml
Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
Llama-4-Scout-Fp8-ModelOpt-marlin.yaml Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
Llama-4-Scout-Fp8-ModelOpt-triton.yaml Llama-4-Scout-Fp8-ModelOpt-triton.yaml
Qwen3-30B-A3B-BF16-fi-cutlass.yaml
Qwen3-30B-A3B-BF16-triton.yaml
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from enum import Enum
import torch
from torch.nn import Module
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._aiter_ops import rocm_aiter_ops
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
swap_w13_to_w31,
)
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
logger = init_logger(__name__)
class UnquantizedMoeBackend(Enum):
FLASHINFER_CUTLASS = "FlashInfer CUTLASS"
AITER = "ROCm AITER"
TRITON = "TRITON"
CPU = "CPU"
XPU = "XPU"
# NOTE(zyongye): Unsupported backend means backend
# that is not conform with Modular kernel format.
# We will directly call the kernel for those backend
UNSUPPORTED_BACKEND = [
UnquantizedMoeBackend.CPU,
UnquantizedMoeBackend.XPU,
]
def select_unquantized_moe_backend(
use_ep: bool,
use_dp: bool,
) -> UnquantizedMoeBackend:
"""
Select the primary FP8 MoE backend
Note: Shape-specific fallbacks may still occur at runtime.
"""
def _make_log_backend(backend: UnquantizedMoeBackend):
return f"Using {backend.value} backend for Unquantized MoE"
rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
# FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
flashinfer_cutlass_moe_enabled = (
has_flashinfer_cutlass_fused_moe()
and envs.VLLM_USE_FLASHINFER_MOE_FP16
and use_ep
and (not use_dp)
and current_platform.get_device_capability()[0] >= 9
)
if current_platform.is_rocm():
if rocm_aiter_moe_enabled:
backend = UnquantizedMoeBackend.AITER
else:
backend = UnquantizedMoeBackend.TRITON
if current_platform.is_cuda():
if flashinfer_cutlass_moe_enabled:
backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
else:
if use_ep and (not use_dp):
logger.info_once(
"FlashInfer CUTLASS MoE is available for EP"
" but not enabled, consider setting"
" VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
scope="local",
)
elif use_dp:
logger.info_once(
"FlashInfer CUTLASS MoE is currently not available for DP.",
scope="local",
)
backend = UnquantizedMoeBackend.TRITON
if current_platform.is_xpu():
backend = UnquantizedMoeBackend.XPU
if current_platform.is_cpu():
backend = UnquantizedMoeBackend.CPU
logger.info_once(_make_log_backend(backend), scope="local")
return backend
def convert_to_unquantized_kernel_format(
unquantized_backend: UnquantizedMoeBackend,
layer: Module,
w13_weight: torch.Tensor | None = None,
w2_weight: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
if unquantized_backend == UnquantizedMoeBackend.AITER:
w13_weight, w2_weight = rocm_aiter_ops.shuffle_weights(
layer.w13_weight.data, layer.w2_weight.data
)
elif unquantized_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS:
# Swap halves to arrange as [w3; w1] (kernel expectation)
w13_weight = swap_w13_to_w31(layer.w13_weight.data)
return w13_weight, w2_weight
def make_unquantized_moe_kernel(
layer: torch.nn.Module,
backend: UnquantizedMoeBackend,
quant_config: FusedMoEQuantConfig,
moe_config: FusedMoEConfig,
) -> tuple[mk.FusedMoEModularKernel | None, bool]:
use_inplace = True
if backend in UNSUPPORTED_BACKEND:
return None, use_inplace
if backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS:
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
FlashInferExperts(
out_dtype=layer.params_dtype,
quant_config=quant_config,
tp_rank=moe_config.moe_parallel_config.tp_rank,
tp_size=moe_config.moe_parallel_config.tp_size,
ep_rank=moe_config.moe_parallel_config.ep_rank,
ep_size=moe_config.moe_parallel_config.ep_size,
),
)
use_inplace = False
elif backend == UnquantizedMoeBackend.AITER:
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
AiterExperts,
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
AiterExperts(quant_config),
)
elif backend == UnquantizedMoeBackend.TRITON:
from vllm.model_executor.layers.fused_moe import TritonExperts
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
TritonExperts(quant_config),
)
return kernel, use_inplace
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from torch.nn import Module
import vllm.envs as envs import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm._aiter_ops import rocm_aiter_ops
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
...@@ -16,9 +16,6 @@ from vllm.model_executor.layers.fused_moe.config import ( ...@@ -16,9 +16,6 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig, FusedMoEQuantConfig,
biased_moe_quant_config, biased_moe_quant_config,
) )
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
FusedMoEMethodBase, FusedMoEMethodBase,
) )
...@@ -28,19 +25,15 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( ...@@ -28,19 +25,15 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEPermuteExpertsUnpermute, FusedMoEPermuteExpertsUnpermute,
FusedMoEPrepareAndFinalize, FusedMoEPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.prepare_finalize import ( from vllm.model_executor.layers.fused_moe.oracle.unquantized import (
MoEPrepareAndFinalizeNoEP, UnquantizedMoeBackend,
) convert_to_unquantized_kernel_format,
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( make_unquantized_moe_kernel,
AiterExperts, select_unquantized_moe_backend,
)
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
swap_w13_to_w31,
) )
from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.model_executor.utils import replace_parameter, set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.platforms.interface import CpuArchEnum from vllm.platforms.interface import CpuArchEnum
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
from .fused_batched_moe import BatchedTritonExperts from .fused_batched_moe import BatchedTritonExperts
...@@ -57,41 +50,13 @@ logger = init_logger(__name__) ...@@ -57,41 +50,13 @@ logger = init_logger(__name__)
class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
"""MoE method without quantization.""" """MoE method without quantization."""
# --8<-- [end:unquantized_fused_moe]
def __init__(self, moe: FusedMoEConfig): def __init__(self, moe: FusedMoEConfig):
super().__init__(moe) super().__init__(moe)
self.unquantized_backend = select_unquantized_moe_backend(
self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() use_ep=self.moe.moe_parallel_config.use_ep,
use_dp=self.moe.moe_parallel_config.dp_size > 1,
# FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
self.flashinfer_cutlass_moe_enabled = (
has_flashinfer_cutlass_fused_moe()
and envs.VLLM_USE_FLASHINFER_MOE_FP16
and self.moe.moe_parallel_config.use_ep
and self.moe.moe_parallel_config.dp_size == 1
and current_platform.get_device_capability()[0] >= 9
)
if self.flashinfer_cutlass_moe_enabled:
logger.info_once(
"Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod"
)
else:
if (
self.moe.moe_parallel_config.use_ep
and self.moe.moe_parallel_config.dp_size == 1
):
logger.info_once(
"FlashInfer CUTLASS MoE is available for EP"
" but not enabled, consider setting"
" VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
scope="local",
)
elif self.moe.moe_parallel_config.dp_size > 1:
logger.info_once(
"FlashInfer CUTLASS MoE is currently not available for DP.",
scope="local",
) )
self.kernel: mk.FusedMoEModularKernel | None = None
@property @property
def supports_eplb(self) -> bool: def supports_eplb(self) -> bool:
...@@ -105,7 +70,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -105,7 +70,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
self, self,
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> FusedMoEPrepareAndFinalize | None: ) -> FusedMoEPrepareAndFinalize | None:
if self.rocm_aiter_moe_enabled: if self.unquantized_backend == UnquantizedMoeBackend.AITER:
return None return None
else: else:
return super().maybe_make_prepare_finalize(routing_tables) return super().maybe_make_prepare_finalize(routing_tables)
...@@ -197,6 +162,33 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -197,6 +162,33 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
return weight return weight
def _setup_kernel(
self,
layer: Module,
w13: torch.Tensor,
w2: torch.Tensor,
) -> None:
# Shuffle weights to runtime format.
w13, w2 = convert_to_unquantized_kernel_format(
self.unquantized_backend,
layer=layer,
w13_weight=w13,
w2_weight=w2,
)
replace_parameter(layer, "w13_weight", w13)
replace_parameter(layer, "w2_weight", w2)
# Setup Modular Kernel for TP Case
self.moe_quant_config = self.get_fused_moe_quant_config(layer)
assert self.moe_quant_config is not None
self.kernel, self.use_inplace = make_unquantized_moe_kernel(
layer=layer,
backend=self.unquantized_backend,
quant_config=self.moe_quant_config,
moe_config=self.moe,
)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
super().process_weights_after_loading(layer) super().process_weights_after_loading(layer)
...@@ -204,7 +196,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -204,7 +196,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data) layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data) layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)
if current_platform.is_xpu(): if self.unquantized_backend == UnquantizedMoeBackend.XPU:
import intel_extension_for_pytorch as ipex import intel_extension_for_pytorch as ipex
ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
...@@ -214,7 +206,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -214,7 +206,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
use_prepack=True, use_prepack=True,
experts_start_id=ep_rank_start, experts_start_id=ep_rank_start,
) )
elif current_platform.is_cpu(): elif self.unquantized_backend == UnquantizedMoeBackend.CPU:
from vllm.model_executor.layers.fused_moe import cpu_fused_moe from vllm.model_executor.layers.fused_moe import cpu_fused_moe
if current_platform.get_cpu_architecture() == CpuArchEnum.X86: if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
...@@ -246,44 +238,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -246,44 +238,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
else: else:
layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
elif current_platform.is_cuda_alike(): elif current_platform.is_cuda_alike():
self.moe_quant_config = self.get_fused_moe_quant_config(layer) self._setup_kernel(
if self.rocm_aiter_moe_enabled: layer=layer,
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( w13=layer.w13_weight,
layer.w13_weight.data, layer.w2_weight.data w2=layer.w2_weight,
)
replace_parameter(layer, "w13_weight", shuffled_w13)
replace_parameter(layer, "w2_weight", shuffled_w2)
self.use_inplace = True
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
AiterExperts(self.moe_quant_config),
shared_experts=None,
)
elif self.flashinfer_cutlass_moe_enabled:
self.use_inplace = False
# Swap halves to arrange as [w3; w1] (kernel expectation)
w13_weight = swap_w13_to_w31(layer.w13_weight.data)
replace_parameter(layer, "w13_weight", w13_weight)
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
FlashInferExperts(
out_dtype=layer.params_dtype,
quant_config=self.moe_quant_config,
tp_rank=self.moe.moe_parallel_config.tp_rank,
tp_size=self.moe.moe_parallel_config.tp_size,
ep_rank=self.moe.moe_parallel_config.ep_rank,
ep_size=self.moe.moe_parallel_config.ep_size,
),
)
else:
self.use_inplace = True
self.kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
TritonExperts(self.moe_quant_config),
shared_experts=None,
) )
def apply( def apply(
...@@ -316,6 +274,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -316,6 +274,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
x: torch.Tensor, x: torch.Tensor,
router_logits: torch.Tensor, router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert self.kernel
topk_weights, topk_ids = router.select_experts( topk_weights, topk_ids = router.select_experts(
hidden_states=x, hidden_states=x,
router_logits=router_logits, router_logits=router_logits,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment