Unverified commit 1a9353bb, authored by Jackmin801 and committed by GitHub
Browse files

[MoE] Move GPT OSS Triton kernel experts into fused_moe/experts/ (#39007)


Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Co-authored-by: Robert Shaw <robertgshaw2@gmail.com>
parent ecf5ff7c
...@@ -86,7 +86,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k ...@@ -86,7 +86,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | | cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] | | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
| rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` | | rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
......
...@@ -25,7 +25,7 @@ from triton_kernels.tensor_details import layout ...@@ -25,7 +25,7 @@ from triton_kernels.tensor_details import layout
from triton_kernels.testing import assert_close from triton_kernels.testing import assert_close
from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
triton_kernel_moe_forward, triton_kernel_moe_forward,
) )
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
......
...@@ -29,7 +29,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( ...@@ -29,7 +29,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize, maybe_make_prepare_finalize,
) )
from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
OAITritonExperts, OAITritonExperts,
UnfusedOAITritonExperts, UnfusedOAITritonExperts,
) )
......
...@@ -29,18 +29,22 @@ class TestTritonMoeForwardExpertMap: ...@@ -29,18 +29,22 @@ class TestTritonMoeForwardExpertMap:
torch.tensor([0, -1, 1, -1], device=device) if expert_map_present else None torch.tensor([0, -1, 1, -1], device=device) if expert_map_present else None
) )
from vllm.utils.import_utils import import_triton_kernels
import_triton_kernels()
with ( with (
patch("triton_kernels.topk.topk") as mock_topk, patch("triton_kernels.topk.topk") as mock_topk,
patch( patch(
"vllm.model_executor.layers.fused_moe." "vllm.model_executor.layers.fused_moe.experts."
"gpt_oss_triton_kernels_moe.make_routing_data" "gpt_oss_triton_kernels_moe.make_routing_data"
) as mock_make_routing, ) as mock_make_routing,
patch( patch(
"vllm.model_executor.layers.fused_moe." "vllm.model_executor.layers.fused_moe.experts."
"gpt_oss_triton_kernels_moe.triton_kernel_fused_experts" "gpt_oss_triton_kernels_moe.triton_kernel_fused_experts"
) as mock_fused_experts, ) as mock_fused_experts,
): ):
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
triton_kernel_moe_forward, triton_kernel_moe_forward,
) )
......
...@@ -19,6 +19,9 @@ from vllm.model_executor.layers.fused_moe import FusedMoE ...@@ -19,6 +19,9 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
_get_config_dtype_str, _get_config_dtype_str,
) )
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
UnfusedOAITritonExperts,
)
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
MarlinExperts, MarlinExperts,
) )
...@@ -28,9 +31,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( ...@@ -28,9 +31,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
FusedMoEModularMethod, FusedMoEModularMethod,
) )
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
UnfusedOAITritonExperts,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEKernel, FusedMoEKernel,
) )
......
...@@ -101,7 +101,7 @@ def backend_to_kernel_cls( ...@@ -101,7 +101,7 @@ def backend_to_kernel_cls(
return [FlashInferExperts] return [FlashInferExperts]
elif backend == Mxfp4MoeBackend.TRITON: elif backend == Mxfp4MoeBackend.TRITON:
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
OAITritonExperts, OAITritonExperts,
OAITritonMxfp4ExpertsMonolithic, OAITritonMxfp4ExpertsMonolithic,
) )
...@@ -110,7 +110,7 @@ def backend_to_kernel_cls( ...@@ -110,7 +110,7 @@ def backend_to_kernel_cls(
return [OAITritonMxfp4ExpertsMonolithic, OAITritonExperts] return [OAITritonMxfp4ExpertsMonolithic, OAITritonExperts]
elif backend == Mxfp4MoeBackend.TRITON_UNFUSED: elif backend == Mxfp4MoeBackend.TRITON_UNFUSED:
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
UnfusedOAITritonExperts, UnfusedOAITritonExperts,
) )
......
...@@ -1591,7 +1591,7 @@ class QuarkOCP_MX_MoEMethod_OSS(QuarkOCP_MX_MoEMethod): ...@@ -1591,7 +1591,7 @@ class QuarkOCP_MX_MoEMethod_OSS(QuarkOCP_MX_MoEMethod):
"EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet." "EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet."
) )
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
triton_kernel_moe_forward, triton_kernel_moe_forward,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment