"vscode:/vscode.git/clone" did not exist on "dc7fb5bebe21657109672dba18f725753df93aac"
Unverified commit 1a9353bb, authored by Jackmin801, committed by GitHub
Browse files

[MoE] Move GPT OSS Triton kernel experts into fused_moe/experts/ (#39007)


Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Co-authored-by: Robert Shaw <robertgshaw2@gmail.com>
parent ecf5ff7c
......@@ -86,7 +86,7 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
| cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
| cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.OAITritonExperts] |
| marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvFp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
| rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
......
......@@ -25,7 +25,7 @@ from triton_kernels.tensor_details import layout
from triton_kernels.testing import assert_close
from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
triton_kernel_moe_forward,
)
from vllm.utils.math_utils import round_up
......
......@@ -29,7 +29,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
OAITritonExperts,
UnfusedOAITritonExperts,
)
......
......@@ -29,18 +29,22 @@ class TestTritonMoeForwardExpertMap:
torch.tensor([0, -1, 1, -1], device=device) if expert_map_present else None
)
from vllm.utils.import_utils import import_triton_kernels
import_triton_kernels()
with (
patch("triton_kernels.topk.topk") as mock_topk,
patch(
"vllm.model_executor.layers.fused_moe."
"vllm.model_executor.layers.fused_moe.experts."
"gpt_oss_triton_kernels_moe.make_routing_data"
) as mock_make_routing,
patch(
"vllm.model_executor.layers.fused_moe."
"vllm.model_executor.layers.fused_moe.experts."
"gpt_oss_triton_kernels_moe.triton_kernel_fused_experts"
) as mock_fused_experts,
):
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
triton_kernel_moe_forward,
)
......
......@@ -19,6 +19,9 @@ from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import (
_get_config_dtype_str,
)
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import (
UnfusedOAITritonExperts,
)
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
MarlinExperts,
)
......@@ -28,9 +31,6 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
FusedMoEModularMethod,
)
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
UnfusedOAITritonExperts,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEKernel,
)
......
......@@ -101,7 +101,7 @@ def backend_to_kernel_cls(
return [FlashInferExperts]
elif backend == Mxfp4MoeBackend.TRITON:
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
OAITritonExperts,
OAITritonMxfp4ExpertsMonolithic,
)
......@@ -110,7 +110,7 @@ def backend_to_kernel_cls(
return [OAITritonMxfp4ExpertsMonolithic, OAITritonExperts]
elif backend == Mxfp4MoeBackend.TRITON_UNFUSED:
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
UnfusedOAITritonExperts,
)
......
......@@ -1591,7 +1591,7 @@ class QuarkOCP_MX_MoEMethod_OSS(QuarkOCP_MX_MoEMethod):
"EPLB not supported for `QuarkW4MXFp4MoEMethod_OSS` yet."
)
from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe import ( # noqa: E501
triton_kernel_moe_forward,
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment