Unverified Commit 079a4cf3 authored by Jackmin801's avatar Jackmin801 Committed by GitHub
Browse files

[MoE] Move cutlass moe to fused_moe/experts/ (#40574)


Signed-off-by: default avatarJackmin801 <ongjackm@gmail.com>
Co-authored-by: default avatarClaude <noreply@anthropic.com>
parent 9744b699
...@@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( ...@@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize, maybe_make_prepare_finalize,
) )
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
......
...@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.config import ( ...@@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config, fp8_w8a8_moe_quant_config,
nvfp4_moe_quant_config, nvfp4_moe_quant_config,
) )
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp4, CutlassExpertsFp4,
) )
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
......
...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( ...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize, maybe_make_prepare_finalize,
) )
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts, fused_experts,
fused_topk, fused_topk,
......
...@@ -83,8 +83,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k ...@@ -83,8 +83,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | | triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
| triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] | | deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | </br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | | cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.OAITritonExperts] | | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
......
...@@ -367,7 +367,9 @@ else: ...@@ -367,7 +367,9 @@ else:
CutlassExpertsFp8 = None CutlassExpertsFp8 = None
if cutlass_fp4_supported(): if cutlass_fp4_supported():
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4 from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp4,
)
register_experts( register_experts(
CutlassExpertsFp4, CutlassExpertsFp4,
......
...@@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.config import ( ...@@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig, FusedMoEQuantConfig,
fp8_w8a8_moe_quant_config, fp8_w8a8_moe_quant_config,
) )
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp8, CutlassExpertsFp8,
run_cutlass_moe_fp8, run_cutlass_moe_fp8,
) )
......
...@@ -19,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( ...@@ -19,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize, maybe_make_prepare_finalize,
) )
from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp4, CutlassExpertsFp4,
) )
from vllm.model_executor.layers.fused_moe.prepare_finalize import ( from vllm.model_executor.layers.fused_moe.prepare_finalize import (
......
...@@ -73,15 +73,15 @@ __all__ = [ ...@@ -73,15 +73,15 @@ __all__ = [
if HAS_TRITON: if HAS_TRITON:
# import to register the custom ops # import to register the custom ops
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassBatchedExpertsFp8, CutlassBatchedExpertsFp8,
CutlassExpertsFp8, CutlassExpertsFp8,
CutlassExpertsW4A8Fp8, CutlassExpertsW4A8Fp8,
cutlass_moe_w4a8_fp8, cutlass_moe_w4a8_fp8,
) )
from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
DeepGemmExperts, DeepGemmExperts,
) )
......
...@@ -173,7 +173,7 @@ def backend_to_kernel_cls( ...@@ -173,7 +173,7 @@ def backend_to_kernel_cls(
return [TritonOrCutlassExperts] return [TritonOrCutlassExperts]
elif backend == Fp8MoeBackend.BATCHED_VLLM_CUTLASS: elif backend == Fp8MoeBackend.BATCHED_VLLM_CUTLASS:
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassBatchedExpertsFp8, CutlassBatchedExpertsFp8,
) )
......
...@@ -107,7 +107,7 @@ def backend_to_kernel_cls( ...@@ -107,7 +107,7 @@ def backend_to_kernel_cls(
return [FlashInferCuteDSLBatchedExperts] return [FlashInferCuteDSLBatchedExperts]
elif backend == NvFp4MoeBackend.VLLM_CUTLASS: elif backend == NvFp4MoeBackend.VLLM_CUTLASS:
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsFp4, CutlassExpertsFp4,
) )
......
...@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.config import ( ...@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
FusedMoEQuantConfig, FusedMoEQuantConfig,
) )
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts
from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
from vllm.platforms import current_platform from vllm.platforms import current_platform
......
...@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import ( ...@@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig, FusedMoEQuantConfig,
mxfp4_moe_quant_config, mxfp4_moe_quant_config,
) )
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
CutlassExpertsMxfp4, CutlassExpertsMxfp4,
) )
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
...@@ -149,7 +149,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): ...@@ -149,7 +149,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
if self.use_cutlass_mxfp4: if self.use_cutlass_mxfp4:
# Swizzle weight scales from flat checkpoint layout [E, N, K//32] # Swizzle weight scales from flat checkpoint layout [E, N, K//32]
# to CUTLASS tiled layout [E, numMTiles*numKTiles*512]. # to CUTLASS tiled layout [E, numMTiles*numKTiles*512].
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
swizzle_mxfp4_scales, swizzle_mxfp4_scales,
) )
......
...@@ -315,7 +315,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): ...@@ -315,7 +315,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
) )
assert self.moe_quant_config is not None assert self.moe_quant_config is not None
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import (
cutlass_moe_w4a8_fp8, cutlass_moe_w4a8_fp8,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment