Unverified commit 1b1c01de, authored by Jackmin801, committed by GitHub
Browse files

[MoE] Move xpu moe to fused_moe/experts/ (#40568)


Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
parent e9ba519f
...@@ -262,7 +262,7 @@ pull_request_rules: ...@@ -262,7 +262,7 @@ pull_request_rules:
- files~=^docker/Dockerfile.xpu - files~=^docker/Dockerfile.xpu
- files~=^\\.buildkite/intel_jobs/ - files~=^\\.buildkite/intel_jobs/
- files=\.buildkite/ci_config_intel.yaml - files=\.buildkite/ci_config_intel.yaml
- files=vllm/model_executor/layers/fused_moe/xpu_fused_moe.py - files=vllm/model_executor/layers/fused_moe/experts/xpu_moe.py
- files=vllm/model_executor/kernels/linear/mixed_precision/xpu.py - files=vllm/model_executor/kernels/linear/mixed_precision/xpu.py
- files=vllm/model_executor/kernels/linear/mxfp8/xpu.py - files=vllm/model_executor/kernels/linear/mxfp8/xpu.py
- files=vllm/model_executor/kernels/linear/scaled_mm/xpu.py - files=vllm/model_executor/kernels/linear/scaled_mm/xpu.py
......
...@@ -85,6 +85,11 @@ if HAS_TRITON: ...@@ -85,6 +85,11 @@ if HAS_TRITON:
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
DeepGemmExperts, DeepGemmExperts,
) )
from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
XPUExperts,
XPUExpertsFp8,
XPUExpertsMXFp4,
)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedTritonExperts, BatchedTritonExperts,
) )
...@@ -106,10 +111,6 @@ if HAS_TRITON: ...@@ -106,10 +111,6 @@ if HAS_TRITON:
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts, TritonOrDeepGemmExperts,
) )
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
XPUExperts,
XPUExpertsFp8,
)
__all__ += [ __all__ += [
"AiterExperts", "AiterExperts",
...@@ -129,6 +130,7 @@ if HAS_TRITON: ...@@ -129,6 +130,7 @@ if HAS_TRITON:
"TritonOrDeepGemmExperts", "TritonOrDeepGemmExperts",
"XPUExperts", "XPUExperts",
"XPUExpertsFp8", "XPUExpertsFp8",
"XPUExpertsMXFp4",
] ]
else: else:
# Some model classes directly use the custom ops. Add placeholders # Some model classes directly use the custom ops. Add placeholders
......
...@@ -180,7 +180,7 @@ def backend_to_kernel_cls( ...@@ -180,7 +180,7 @@ def backend_to_kernel_cls(
return [CutlassBatchedExpertsFp8] return [CutlassBatchedExpertsFp8]
elif backend == Fp8MoeBackend.XPU: elif backend == Fp8MoeBackend.XPU:
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
XPUExpertsFp8, XPUExpertsFp8,
) )
...@@ -470,7 +470,7 @@ def convert_to_fp8_moe_kernel_format( ...@@ -470,7 +470,7 @@ def convert_to_fp8_moe_kernel_format(
is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM), is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
) )
elif fp8_backend == Fp8MoeBackend.XPU: elif fp8_backend == Fp8MoeBackend.XPU:
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
prepare_fp8_moe_layer_for_xpu, prepare_fp8_moe_layer_for_xpu,
) )
......
...@@ -141,7 +141,7 @@ def backend_to_kernel_cls( ...@@ -141,7 +141,7 @@ def backend_to_kernel_cls(
return [AiterExperts] return [AiterExperts]
elif backend == Mxfp4MoeBackend.XPU: elif backend == Mxfp4MoeBackend.XPU:
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExpertsMXFp4 from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExpertsMXFp4
return [XPUExpertsMXFp4] return [XPUExpertsMXFp4]
......
...@@ -121,7 +121,7 @@ def backend_to_kernel_cls( ...@@ -121,7 +121,7 @@ def backend_to_kernel_cls(
return BatchedTritonExperts return BatchedTritonExperts
elif backend == UnquantizedMoeBackend.XPU: elif backend == UnquantizedMoeBackend.XPU:
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExperts from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExperts
return XPUExperts return XPUExperts
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment