"docs/vscode:/vscode.git/clone" did not exist on "1ad69e8375e841095c2f682299be487fd9b8f47e"
Unverified Commit 4383f153 authored by Robert Shaw's avatar Robert Shaw Committed by GitHub
Browse files

[MoE] Move PF Methods to Folder (#35927)

parent 6eedec6e
......@@ -33,10 +33,10 @@ th {
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
| ------- | ------------------ | ------------ | ------------- | ----- | --------------------- | --------- |
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
| flashinfer_nvlink_two_sided | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferNVLinkTwoSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize.FlashInferNVLinkTwoSidedPrepareAndFinalize] |
| flashinfer_nvlink_one_sided | standard | nvfp4 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize.FlashInferNVLinkOneSidedPrepareAndFinalize] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll.DeepEPLLPrepareAndFinalize] |
| flashinfer_nvlink_two_sided | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferNVLinkTwoSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided.FlashInferNVLinkTwoSidedPrepareAndFinalize] |
| flashinfer_nvlink_one_sided | standard | nvfp4 | G,A,T | N | N | [`FlashInferNVLinkOneSidedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided.FlashInferNVLinkOneSidedPrepareAndFinalize] |
!!! info "Table key"
1. All types: mxfp4, nvfp4, int4, int8, fp8
......
......@@ -199,10 +199,10 @@ register_experts(
# Disable on blackwell for now
if has_deep_ep() and not current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import (
DeepEPLLPrepareAndFinalize,
)
......@@ -240,7 +240,7 @@ if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided import ( # noqa: E501
FlashInferNVLinkTwoSidedPrepareAndFinalize,
)
......@@ -271,7 +271,7 @@ if (
and has_flashinfer_cutlass_fused_moe()
and current_platform.has_device_capability(100)
):
from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize import ( # noqa: E501
from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided import ( # noqa: E501
FlashInferNVLinkOneSidedPrepareAndFinalize,
)
......
......@@ -19,10 +19,10 @@ from vllm.utils.import_utils import has_deep_ep
from vllm.utils.network_utils import get_open_port
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import (
DeepEPLLPrepareAndFinalize,
)
......
......@@ -37,10 +37,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_dummy_moe_config, make_test_weights
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import (
DeepEPLLPrepareAndFinalize,
)
......
......@@ -32,10 +32,10 @@ from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ht import (
DeepEPHTPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
from vllm.model_executor.layers.fused_moe.prepare_finalize.deepep_ll import (
DeepEPLLPrepareAndFinalize,
)
......
......@@ -15,12 +15,6 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEParallelConfig,
FusedMoEQuantConfig,
)
from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_one_sided_prepare_finalize import ( # noqa: E501
FlashInferNVLinkOneSidedPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_nvlink_two_sided_prepare_finalize import ( # noqa: E501
FlashInferNVLinkTwoSidedPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEPrepareAndFinalize,
)
......@@ -28,6 +22,12 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
make_moe_prepare_and_finalize_naive_dp_ep,
make_moe_prepare_and_finalize_no_dp_ep,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_one_sided import ( # noqa: E501
FlashInferNVLinkOneSidedPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.prepare_finalize.flashinfer_nvlink_two_sided import ( # noqa: E501
FlashInferNVLinkTwoSidedPrepareAndFinalize,
)
from vllm.platforms import current_platform
from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep
......@@ -35,8 +35,8 @@ logger = init_logger(__name__)
if current_platform.is_cuda_alike():
if has_deep_ep():
from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
from .deepep_ll_prepare_finalize import (
from .prepare_finalize.deepep_ht import DeepEPHTPrepareAndFinalize
from .prepare_finalize.deepep_ll import (
DEEPEP_QUANT_BLOCK_SHAPE,
DeepEPLLPrepareAndFinalize,
)
......
......@@ -19,4 +19,7 @@ __all__ = [
"MoEPrepareAndFinalizeNoDPEPMonolithic",
"MoEPrepareAndFinalizeNoDPEPModular",
"make_moe_prepare_and_finalize_no_dp_ep",
# deepep_ht, deepep_ll, flashinfer_nvlink_one_sided, and
# flashinfer_nvlink_two_sided are not imported here because they have
# optional dependencies (deep_ep, flashinfer).
# Import them directly from their modules as needed.
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment