Unverified commit 1b1c01de, authored by Jackmin801 and committed by GitHub
Browse files

[MoE] Move xpu moe to fused_moe/experts/ (#40568)


Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
parent e9ba519f
......@@ -262,7 +262,7 @@ pull_request_rules:
- files~=^docker/Dockerfile.xpu
- files~=^\\.buildkite/intel_jobs/
- files=\.buildkite/ci_config_intel.yaml
-- files=vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
+- files=vllm/model_executor/layers/fused_moe/experts/xpu_moe.py
- files=vllm/model_executor/kernels/linear/mixed_precision/xpu.py
- files=vllm/model_executor/kernels/linear/mxfp8/xpu.py
- files=vllm/model_executor/kernels/linear/scaled_mm/xpu.py
......
......@@ -85,6 +85,11 @@ if HAS_TRITON:
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
DeepGemmExperts,
)
+from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
+XPUExperts,
+XPUExpertsFp8,
+XPUExpertsMXFp4,
+)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedTritonExperts,
)
......@@ -106,10 +111,6 @@ if HAS_TRITON:
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts,
)
-from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
-XPUExperts,
-XPUExpertsFp8,
-)
__all__ += [
"AiterExperts",
......@@ -129,6 +130,7 @@ if HAS_TRITON:
"TritonOrDeepGemmExperts",
"XPUExperts",
"XPUExpertsFp8",
+"XPUExpertsMXFp4",
]
else:
# Some model classes directly use the custom ops. Add placeholders
......
......@@ -180,7 +180,7 @@ def backend_to_kernel_cls(
return [CutlassBatchedExpertsFp8]
elif backend == Fp8MoeBackend.XPU:
-from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
+from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
XPUExpertsFp8,
)
......@@ -470,7 +470,7 @@ def convert_to_fp8_moe_kernel_format(
is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
)
elif fp8_backend == Fp8MoeBackend.XPU:
-from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
+from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
prepare_fp8_moe_layer_for_xpu,
)
......
......@@ -141,7 +141,7 @@ def backend_to_kernel_cls(
return [AiterExperts]
elif backend == Mxfp4MoeBackend.XPU:
-from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExpertsMXFp4
+from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExpertsMXFp4
return [XPUExpertsMXFp4]
......
......@@ -121,7 +121,7 @@ def backend_to_kernel_cls(
return BatchedTritonExperts
elif backend == UnquantizedMoeBackend.XPU:
-from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExperts
+from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExperts
return XPUExperts
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment