Unverified Commit eb19955c authored by Tyler Michael Smith's avatar Tyler Michael Smith Committed by GitHub
Browse files

[WideEP] Remove pplx all2all backend (#33724)


Signed-off-by: default avatarTyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: default avatarClaude Opus 4.6 <noreply@anthropic.com>
parent 0f2f24c8
......@@ -155,5 +155,14 @@ steps:
commands:
- pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
- pytest -v -s kernels/moe/test_deepep_moe.py
- pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
# - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
- label: Kernels Fp4 MoE Test (B200)
timeout_in_minutes: 60
device: b200
num_devices: 1
optional: true
commands:
- pytest -v -s kernels/moe/test_cutedsl_moe.py
- pytest -v -s kernels/moe/test_flashinfer_moe.py
- pytest -v -s kernels/moe/test_nvfp4_moe.py
- pytest -v -s kernels/moe/test_ocp_mx_moe.py
......@@ -725,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
......
......@@ -269,13 +269,13 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
const int64_t n, const int64_t k, const bool swap_ab);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k);
void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k);
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
......
......@@ -263,12 +263,10 @@ void get_cutlass_moe_mm_data_caller(
}
template <bool SWAP_AB>
__global__ void compute_pplx_data(int32_t* expert_offsets,
int32_t* problem_sizes1,
int32_t* problem_sizes2,
const int32_t* __restrict__ expert_num_tokens,
const int padded_m, const int n,
const int k) {
__global__ void compute_batched_moe_data(
int32_t* expert_offsets, int32_t* problem_sizes1, int32_t* problem_sizes2,
const int32_t* __restrict__ expert_num_tokens, const int padded_m,
const int n, const int k) {
int expert_idx = threadIdx.x;
expert_offsets[expert_idx] = expert_idx * padded_m;
......@@ -289,24 +287,22 @@ __global__ void compute_pplx_data(int32_t* expert_offsets,
}
}
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m,
const int64_t n, const int64_t k) {
void get_cutlass_batched_moe_mm_data_caller(
torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts, const int64_t padded_m, const int64_t n,
const int64_t k) {
auto stream = at::cuda::getCurrentCUDAStream(expert_offsets.device().index());
if (num_local_experts * padded_m > SWAP_AB_THRESHOLD) {
compute_pplx_data<false><<<1, num_local_experts, 0, stream>>>(
compute_batched_moe_data<false><<<1, num_local_experts, 0, stream>>>(
static_cast<int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(problem_sizes2.data_ptr()),
static_cast<const int32_t*>(expert_num_tokens.data_ptr()), padded_m, n,
k);
} else {
compute_pplx_data<true><<<1, num_local_experts, 0, stream>>>(
compute_batched_moe_data<true><<<1, num_local_experts, 0, stream>>>(
static_cast<int32_t*>(expert_offsets.data_ptr()),
static_cast<int32_t*>(problem_sizes1.data_ptr()),
static_cast<int32_t*>(problem_sizes2.data_ptr()),
......
......@@ -82,13 +82,11 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller(
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
const int64_t n, const int64_t k, const bool swap_ab);
void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m,
const int64_t n, const int64_t k);
void get_cutlass_batched_moe_mm_data_caller(
torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2, const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts, const int64_t padded_m, const int64_t n,
const int64_t k);
#endif
void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a,
......@@ -319,29 +317,30 @@ void get_cutlass_moe_mm_problem_sizes_from_expert_offsets(
version_num, ". Required capability: 90, 100, or 120");
}
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k) {
void get_cutlass_batched_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k) {
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
get_cutlass_batched_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90, 100, or 120");
TORCH_CHECK_NOT_IMPLEMENTED(false,
"No compiled get_cutlass_batched_moe_mm_data: no "
"cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num,
". Required capability: 90, 100, or 120");
}
void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
......
......@@ -489,19 +489,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
&get_cutlass_moe_mm_problem_sizes_from_expert_offsets);
// A function that computes data required to run fused MoE with w8a8 grouped
// GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
// GEMM in batched expert format. It takes expert_num_tokens
// as an input, and computes expert_offsets (token start indices of each
// expert). In addition to this, it computes problem sizes for each expert's
// multiplication used by the two mms called from fused MoE operation.
ops.def(
"get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
"get_cutlass_batched_moe_mm_data(Tensor! expert_offsets, "
" Tensor! problem_sizes1, "
" Tensor! problem_sizes2, "
" Tensor expert_num_tokens, "
" int num_local_experts, int padded_m, "
" int n, int k) -> ()");
ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
&get_cutlass_pplx_moe_mm_data);
ops.impl("get_cutlass_batched_moe_mm_data", torch::kCUDA,
&get_cutlass_batched_moe_mm_data);
// Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
ops.def(
......
......@@ -308,7 +308,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
#################### CSRC BUILD IMAGE ####################
#################### EXTENSIONS BUILD IMAGE ####################
# Build DeepGEMM, pplx-kernels, DeepEP - runs in PARALLEL with csrc-build
# Build DeepGEMM, DeepEP - runs in PARALLEL with csrc-build
# This stage is independent and doesn't affect csrc cache
FROM base AS extensions-build
ARG CUDA_VERSION
......@@ -335,10 +335,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Ensure the wheel dir exists so COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
# Build pplx-kernels and DeepEP wheels
# Build DeepEP wheels
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management
ARG PPLX_COMMIT_HASH=12cecfd
ARG DEEPEP_COMMIT_HASH=73b6ea4
ARG NVSHMEM_VER
RUN --mount=type=cache,target=/root/.cache/uv \
......@@ -347,7 +346,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \
${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
......@@ -676,7 +674,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
# Install EP kernels wheels (DeepEP) that have been built in the `build` stage
RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system ep_kernels/dist/*.whl --verbose \
......
......@@ -52,9 +52,6 @@
"DEEPGEMM_GIT_REF": {
"default": "477618cd51baffca09c4b0b87e97c03fe827ef03"
},
"PPLX_COMMIT_HASH": {
"default": "12cecfd"
},
"DEEPEP_COMMIT_HASH": {
"default": "73b6ea4"
},
......
......@@ -15,7 +15,7 @@ Based on the format of the input activations, FusedMoE implementations are broad
The input activation format completely depends on the All2All Dispatch being used.
* In the Contiguous variant, the All2All Dispatch returns the activations as a contiguous tensor of shape (M, K) along with TopK Ids and TopK weights of shape (M, num_topk). Look at `DeepEPHTPrepareAndFinalize` for an example.
* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `PplxPrepareAndFinalize` or `DeepEPLLPrepareAndFinalize` for an example.
* In the Batched variant, the All2All Dispatch returns the activations as a tensor of shape (num_experts, max_tokens, K). Here, the activations/tokens that subscribe to the same expert are batched together. Note that not all entries of the tensor are valid. The activations tensor is typically accompanied by an `expert_num_tokens` tensor of size `num_experts`, where `expert_num_tokens[i]` indicates the number of valid tokens that subscribe to the ith expert. Look at `DeepEPLLPrepareAndFinalize` for an example.
The FusedMoE operation is generally made of multiple operations, in both the Contiguous and Batched variants, as described in the diagrams below
......@@ -132,7 +132,6 @@ class FusedMoEModularKernel:
Typically a FusedMoEPrepareAndFinalize type is backed by an All2All Dispatch & Combine implementation / kernel. For example,
* PplxPrepareAndFinalize type is backed by Pplx All2All kernels,
* DeepEPHTPrepareAndFinalize type is backed by DeepEP High-Throughput All2All kernels, and
* DeepEPLLPrepareAndFinalize type is backed by DeepEP Low-Latency All2All kernels.
......@@ -229,7 +228,7 @@ Doing this will add the new implementation to the test suite.
### How To Check `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` Compatibility
The unit test file [test_modular_kernel_combinations.py](../../tests/kernels/moe/test_modular_kernel_combinations.py) can also be executed as a standalone script.
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
Example: `python3 -m tests.kernels.moe.test_modular_kernel_combinations --pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts`
As a side effect, this script can be used to test `FusedMoEPrepareAndFinalize` & `FusedMoEPermuteExpertsUnpermute` compatibility. When invoked
with incompatible types, the script will error.
......@@ -238,7 +237,7 @@ with incompatible types, the script will error.
Please take a look at [profile_modular_kernel.py](../../tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py)
The script can be used to generate Torch traces for a single `FusedMoEModularKernel::forward()` call for any compatible
`FusedMoEPrepareAndFinalize` and `FusedMoEPermuteExpertsUnpermute` types.
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts`
Example: `python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel --pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts`
## FusedMoEPrepareAndFinalize Implementations
......
......@@ -33,7 +33,6 @@ th {
| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferA2APrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize.FlashInferA2APrepareAndFinalize] |
......@@ -68,7 +67,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`.
Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `DeepEPLLPrepareAndFinalize`.
Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type.
......@@ -110,5 +109,5 @@ The following table shows "families" of modular kernels that are intended to wor
| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
|---------|-----------------------------------------|----------------------------------------------|
| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| deepep_low_latency | `DeepEPLLPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
......@@ -154,7 +154,7 @@ If you have PRs touching the area, please feel free to ping the area owner for r
- FlashAttention: @LucasWilkinson
- FlashInfer: @LucasWilkinson, @mgoin, @WoosukKwon
- Blackwell Kernels: @mgoin, @yewentao256
- DeepEP/DeepGEMM/pplx: @mgoin, @yewentao256
- DeepEP/DeepGEMM: @mgoin, @yewentao256
### Integrations
......
......@@ -8,7 +8,7 @@ EP is typically coupled with Data Parallelism (DP). While DP can be used indepen
Before using EP, you need to install the necessary dependencies. We are actively working on making this easier in the future:
1. **Install DeepEP and pplx-kernels**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
1. **Install DeepEP**: Set up host environment following vLLM's guide for EP kernels [here](../../tools/ep_kernels).
2. **Install DeepGEMM library**: Follow the [official instructions](https://github.com/deepseek-ai/DeepGEMM#installation).
3. **For disaggregated serving**: Install `gdrcopy` by running the [`install_gdrcopy.sh`](../../tools/install_gdrcopy.sh) script (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/).
......@@ -19,7 +19,6 @@ vLLM provides multiple communication backends for EP. Use `--all2all-backend` to
| Backend | Use Case | Features | Best For |
|---------|----------|----------|----------|
| `allgather_reducescatter` | Default backend | Standard all2all using allgather/reducescatter primitives | General purpose, works with any EP+DP configuration |
| `pplx` | Single node | Chunked prefill support, efficient intra-node communication | Single-node deployments, development |
| `deepep_high_throughput` | Multi-node prefill | Grouped GEMM with continuous layout, optimized for prefill | Prefill-dominated workloads, high-throughput scenarios |
| `deepep_low_latency` | Multi-node decode | CUDA graph support, masked layout, optimized for decode | Decode-dominated workloads, low-latency scenarios |
| `flashinfer_all2allv` | MNNVL systems | FlashInfer alltoallv kernels for multi-node NVLink | Systems with NVLink across nodes |
......@@ -71,12 +70,11 @@ For example, with `TP=2, DP=4` (8 GPUs total):
The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
```bash
# Single node EP deployment with pplx backend
# Single node EP deployment
vllm serve deepseek-ai/DeepSeek-V3-0324 \
--tensor-parallel-size 1 \ # Tensor parallelism across 1 GPU
--data-parallel-size 8 \ # Data parallelism across 8 processes
--enable-expert-parallel \ # Enable expert parallelism
--all2all-backend pplx # Use pplx communication backend
--enable-expert-parallel # Enable expert parallelism
```
## Multi-Node Deployment
......@@ -197,7 +195,6 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
--tensor-parallel-size 1 \ # Tensor parallelism
--data-parallel-size 8 \ # Data parallelism
--enable-expert-parallel \ # Enable EP
--all2all-backend pplx \ # Use pplx communication backend
--enable-eplb \ # Enable load balancer
--eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
```
......
......@@ -64,7 +64,7 @@ vllm serve "$MODEL_NAME" \
--enforce-eager \
--enable-expert-parallel \
--enable-eplb \
--all2all-backend pplx \
--all2all-backend allgather_reducescatter \
--num-redundant-experts "$REDUNDANT_EXPERTS" \
--trust-remote-code \
--host "$HOST" \
......
......@@ -37,7 +37,6 @@ from vllm.utils.import_utils import (
has_deep_ep,
has_deep_gemm,
has_mori,
has_pplx,
)
from .mk_objects import (
......@@ -206,10 +205,6 @@ class Config:
info = expert_info(self.fused_experts_type)
return info.needs_deep_gemm
def needs_pplx(self):
info = prepare_finalize_info(self.prepare_finalize_type)
return info.backend == "pplx"
def needs_deep_ep(self):
info = prepare_finalize_info(self.prepare_finalize_type)
return (
......@@ -290,8 +285,6 @@ class Config:
return False, "Needs DeepEP, but DeepEP not available."
if self.needs_deep_gemm() and not has_deep_gemm():
return False, "Needs DeepGEMM, but DeepGEMM not available."
if self.needs_pplx() and not has_pplx(): # noqa: SIM103
return False, "Needs PPLX, but PPLX not available."
if self.needs_aiter() and not has_aiter(): # noqa: SIM103
return False, "Needs Aiter, but Aiter not available."
if self.needs_mori() and not has_mori(): # noqa: SIM103
......
......@@ -39,7 +39,6 @@ from vllm.utils.import_utils import (
has_deep_ep,
has_deep_gemm,
has_mori,
has_pplx,
)
......@@ -238,19 +237,6 @@ if has_mori():
supports_apply_weight_on_input=False,
)
if has_pplx():
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize,
)
register_prepare_and_finalize(
PplxPrepareAndFinalize,
batched_format,
common_float_and_int_types,
blocked_quantization_support=True,
backend="pplx",
)
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
......
......@@ -125,7 +125,7 @@ if __name__ == "__main__":
description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.modular_kernel_tools.profile_modular_kernel " # noqa: E501
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
"--pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts"
)
)
args = parser.parse_args()
......
......@@ -14,7 +14,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager
......@@ -39,12 +39,12 @@ from .modular_kernel_tools.parallel_utils import (
)
has_any_multi_gpu_package = (
has_deep_ep() or has_deep_gemm() or has_pplx() or has_flashinfer_cutlass_fused_moe()
has_deep_ep() or has_deep_gemm() or has_flashinfer_cutlass_fused_moe()
)
meets_multi_gpu_requirements = pytest.mark.skipif(
not has_any_multi_gpu_package,
reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages",
reason="Requires deep_ep or deep_gemm or flashinfer packages",
)
if current_platform.is_fp8_fnuz():
......@@ -341,7 +341,7 @@ if __name__ == "__main__":
description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
"--pf-type DeepEPLLPrepareAndFinalize --experts-type BatchedTritonExperts"
)
)
args = parser.parse_args()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.kernels.utils import torch_experts
from vllm import _custom_ops as ops
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
RoutingMethodType,
fp8_w8a8_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassBatchedExpertsFp8
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
try:
from pplx_kernels import AllToAll
from pplx_kernels.nvshmem import (
nvshmem_alloc_empty_unique_id,
nvshmem_finalize,
nvshmem_get_unique_id,
nvshmem_init,
)
has_pplx = True
except ImportError:
has_pplx = False
requires_pplx = pytest.mark.skipif(
not has_pplx,
reason="Requires PPLX kernels",
)
NUM_EXPERTS = [40, 64]
TOP_KS = [6, 8]
def rank_chunk(num, r, w):
rem = num % w
return (num // w) + (1 if r < rem else 0)
def chunk_by_rank(t, r, w):
num = t.shape[0]
chunk = rank_chunk(num, r, w)
rem = num % w
if rem == 0 or r < rem:
return t[(r * chunk) : (r + 1) * chunk].contiguous()
else:
long_chunks = (num // w + 1) * rem
short_chunks = (r - rem) * chunk
start = long_chunks + short_chunks
return t[start : start + chunk].contiguous()
def pplx_cutlass_moe(
pgi: ProcessGroupInfo,
dp_size: int,
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
a1_scale: torch.Tensor,
out_dtype,
per_act_token: bool,
per_out_ch: bool,
group_name: str | None,
):
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize,
)
init_workspace_manager(torch.cuda.current_device())
assert torch.cuda.current_device() == pgi.local_rank
num_tokens, hidden_dim = a.shape
intermediate_dim = w2.shape[2]
num_experts = w1.shape[0]
block_size = hidden_dim # TODO support more cases
device = pgi.device
rank = pgi.rank
world_size = pgi.world_size
rank_num_tokens = rank_chunk(num_tokens, rank, world_size)
max_num_tokens = rank_chunk(num_tokens, 0, world_size)
topk = topk_ids.shape[1]
if block_size == hidden_dim:
scale_elems = 4 # hack to circumvent pplx data format requirements
else:
scale_elems = (hidden_dim + block_size - 1) // block_size
args = dict(
max_num_tokens=max_num_tokens,
num_experts=num_experts,
experts_per_token=topk,
rank=rank,
world_size=world_size,
dp_size=dp_size,
hidden_dim=hidden_dim,
hidden_dim_bytes=hidden_dim, # because a.dtype.itemsize == 1
hidden_dim_scale_bytes=scale_elems * torch.float32.itemsize,
)
if group_name is None:
ata = AllToAll.internode(**args)
else:
args["group_name"] = group_name
ata = AllToAll.intranode(**args)
w1 = w1.to(device)
w2 = w2.to(device)
w1_scale = w1_scale.to(device)
w2_scale = w2_scale.to(device)
a1_scale = a1_scale.to(device)
assert num_experts % world_size == 0
num_local_experts = cdiv(num_experts, world_size)
num_dispatchers = pgi.world_size // dp_size
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens=max_num_tokens,
num_local_experts=num_local_experts,
num_dispatchers=num_dispatchers,
)
def make_moe_config() -> FusedMoEConfig:
return FusedMoEConfig(
num_experts=num_experts,
experts_per_token=topk,
hidden_dim=hidden_dim,
intermediate_size_per_partition=intermediate_dim,
num_local_experts=num_local_experts,
num_logical_experts=num_experts,
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
activation=MoEActivation.SILU,
in_dtype=torch.bfloat16,
device="cuda",
routing_method=RoutingMethodType.Llama4,
)
experts = CutlassBatchedExpertsFp8(
moe_config=make_moe_config(),
quant_config=fp8_w8a8_moe_quant_config(
per_act_token_quant=per_act_token,
per_out_ch_quant=per_out_ch,
w1_scale=chunk_by_rank(w1_scale, rank, world_size),
w2_scale=chunk_by_rank(w2_scale, rank, world_size),
a1_scale=chunk_by_rank(a1_scale, rank, world_size)
if per_act_token
else a1_scale[rank],
),
max_num_tokens=max_num_tokens,
num_dispatchers=num_dispatchers,
)
fused_cutlass_experts = FusedMoEModularKernel(
prepare_finalize,
experts,
inplace=False,
)
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
chunk_topk_weight = chunk_by_rank(topk_weights, rank, world_size).to(device)
chunk_topk_ids = (
chunk_by_rank(topk_ids, rank, world_size).to(torch.uint32).to(device)
)
out = fused_cutlass_experts(
a_chunk,
chunk_by_rank(w1, rank, world_size),
chunk_by_rank(w2, rank, world_size),
chunk_topk_weight,
chunk_topk_ids,
global_num_experts=num_experts,
expert_map=None, # TODO
)
torch.cuda.synchronize()
ata.destroy()
return out[:rank_num_tokens]
vllm_config = VllmConfig()
def _pplx_moe(
pgi: ProcessGroupInfo,
dp_size: int,
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
a1_scale: torch.Tensor,
out_dtype,
a_full: torch.Tensor,
w1_full: torch.Tensor,
w2_full: torch.Tensor,
per_act_token: bool,
per_out_ch: bool,
use_internode: bool,
):
try:
if use_internode:
uid = (
nvshmem_get_unique_id()
if pgi.rank == 0
else nvshmem_alloc_empty_unique_id()
)
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
group_name = cpu_group.group_name
with set_current_vllm_config(vllm_config):
torch_output = torch_experts(
a_full, w1_full, w2_full, topk_weights, topk_ids
)
pplx_output = pplx_cutlass_moe(
pgi,
dp_size,
a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
a1_scale,
out_dtype,
per_act_token,
per_out_ch,
group_name,
)
torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
pplx_output.device
)
# Uncomment if more debugging is needed
# print("PPLX OUT:", pplx_output)
# print("TORCH OUT:", torch_output)
torch.testing.assert_close(pplx_output, torch_output, atol=0.05, rtol=0)
finally:
if use_internode:
nvshmem_finalize()
@pytest.mark.parametrize("m", [2, 224])
@pytest.mark.parametrize("n", [3072])
@pytest.mark.parametrize("k", [1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) # , [4, 2]])
@pytest.mark.parametrize("use_internode", [False])
@multi_gpu_test(num_gpus=2)
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()
),
reason="Grouped gemm is not supported on this GPU type.",
)
@requires_pplx
def test_cutlass_moe_pplx(
m: int,
n: int,
k: int,
e: int,
topk: int,
per_act_token: bool,
per_out_ch: bool,
world_dp_size: tuple[int, int],
use_internode: bool,
):
set_random_seed(7)
with set_current_vllm_config(vllm_config):
dtype = torch.half
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10.0
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10.0
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10.0
n_b_scales = 2 * n if per_out_ch else 1
k_b_scales = k if per_out_ch else 1
w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float8_e4m3fn)
w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
w1_scale = torch.empty((e, n_b_scales, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((e, k_b_scales, 1), device="cuda", dtype=torch.float32)
for expert in range(e):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
w1[expert], use_per_token_if_dynamic=per_out_ch
)
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
w2[expert], use_per_token_if_dynamic=per_out_ch
)
w1_d = torch.empty_like(w1)
w2_d = torch.empty_like(w2)
for expert in range(e):
w1_d[expert] = (w1_q[expert].float() * w1_scale[expert]).half()
w2_d[expert] = (w2_q[expert].float() * w2_scale[expert]).half()
score = torch.randn((m, e), device="cuda", dtype=dtype)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
world_size, dp_size = world_dp_size
a_scale1 = (
torch.randn(
(m if per_act_token else 1, 1), device="cuda", dtype=torch.float32
)
/ 10.0
)
if not per_act_token:
a_scale1 = a_scale1.repeat(world_size, 1)
parallel_launch(
world_size,
_pplx_moe,
dp_size,
a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
a_scale1,
dtype,
a,
w1_d,
w2_d,
per_act_token,
per_out_ch,
use_internode,
)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the MOE layers.
Run `pytest tests/kernels/test_pplx_moe.py`.
"""
import copy
import itertools
import textwrap
import traceback
from collections.abc import Callable
import pytest
import torch
try:
from pplx_kernels import AllToAll
from pplx_kernels.nvshmem import (
nvshmem_alloc_empty_unique_id,
nvshmem_finalize,
nvshmem_get_unique_id,
nvshmem_init,
)
has_pplx = True
except ImportError:
has_pplx = False
from tests.kernels.moe.modular_kernel_tools.parallel_utils import _set_vllm_config
from tests.kernels.moe.utils import (
make_dummy_moe_config,
make_shared_experts,
make_test_weights,
naive_batched_moe,
)
from tests.kernels.quant_utils import dequant
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe import fused_topk, override_config
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_batched_moe import BatchedTritonExperts
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate,
)
from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
requires_pplx = pytest.mark.skipif(
not has_pplx,
reason="Requires PPLX kernels",
)
BATCHED_MOE_MNK_FACTORS = [
(1, 128, 128),
(33, 2048, 128),
(64, 128, 2048),
(222, 128, 128),
(222, 2048, 1024),
]
PPLX_COMBOS = [
# TODO(bnell): figure out why this fails, seems to be test problem
# (1, 128, 128),
(2, 128, 512),
(3, 1024, 2048),
(4, 128, 128),
(32, 1024, 512),
(45, 512, 2048),
(64, 1024, 512),
(222, 2048, 1024),
(256, 1408, 2048),
]
NUM_EXPERTS = [8, 64]
TOP_KS = [1, 2, 6]
DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
vllm_config = VllmConfig()
def torch_prepare(
a: torch.Tensor,
topk_ids: torch.Tensor,
num_experts: int,
max_num_tokens: int | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
assert topk_ids.dim() == 2
assert topk_ids.shape[0] == a.shape[0]
num_tokens, hidden_dim = a.shape
topk = topk_ids.shape[1]
tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
assert tokens_per_expert.numel() == num_experts
if max_num_tokens is None:
max_num_tokens = int(tokens_per_expert.max().item())
b_a = torch.zeros(
(num_experts, max_num_tokens, hidden_dim), dtype=a.dtype, device=a.device
)
token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device)
for token in range(num_tokens):
for j in range(topk):
expert_id = topk_ids[token, j]
idx = token_counts[expert_id]
b_a[expert_id, idx : idx + 1, :] = a[token, :]
token_counts[expert_id] = token_counts[expert_id] + 1
return b_a, tokens_per_expert
def torch_finalize(
b_out: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor
) -> torch.Tensor:
num_tokens = topk_ids.shape[0]
num_experts = b_out.shape[0]
K = b_out.shape[-1]
out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device)
expert_counts = torch.zeros(num_experts, dtype=torch.int, device=b_out.device)
for token in range(num_tokens):
expert_ids = topk_ids[token]
for i in range(expert_ids.numel()):
expert_id = expert_ids[i]
idx = expert_counts[expert_id]
out[token, :] = (
out[token, :]
+ b_out[expert_id, idx : idx + 1, :] * topk_weight[token, i]
)
expert_counts[expert_id] = expert_counts[expert_id] + 1
return out
def torch_batched_moe(
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weight: torch.Tensor,
topk_ids: torch.Tensor,
) -> torch.Tensor:
num_experts = w1.shape[0]
b_a, tokens_per_expert = torch_prepare(a, topk_ids, num_experts)
assert b_a.dim() == 3
num_tokens, topk = topk_ids.shape
_, max_num_tokens, K = b_a.shape
assert num_experts == b_a.shape[0] and w2.shape[1] == K
out = torch.zeros(
(num_experts, max_num_tokens, K), dtype=b_a.dtype, device=b_a.device
)
tmp = torch.empty(
(max_num_tokens, w1.shape[1] // 2), dtype=b_a.dtype, device=b_a.device
)
for expert in range(num_experts):
num = tokens_per_expert[expert]
if num > 0:
torch.ops._C.silu_and_mul(
tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1)
)
out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1)
return torch_finalize(out, topk_weight, topk_ids)
@pytest.mark.parametrize("m,n,k", BATCHED_MOE_MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_fused_moe_batched_experts(
m: int,
n: int,
k: int,
e: int,
topk: int,
dtype: torch.dtype,
workspace_init,
):
set_random_seed(7)
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
score = torch.randn((m, e), device="cuda", dtype=dtype)
with set_current_vllm_config(vllm_config):
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
baseline_output = torch_experts(
a, w1, w2, topk_weight, topk_ids
) # only for baseline
torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
batched_output = naive_batched_moe(
a, w1, w2, topk_weight, topk_ids
) # pick torch_experts or this
torch.testing.assert_close(baseline_output, torch_output, atol=2e-2, rtol=0)
torch.testing.assert_close(baseline_output, batched_output, atol=2e-2, rtol=0)
def create_pplx_prepare_finalize(
num_tokens: int,
hidden_dim: int,
topk: int,
num_experts: int,
rank: int,
dp_size: int,
world_size: int,
in_dtype: torch.dtype,
quant_dtype: torch.dtype | None,
block_shape: list[int] | None,
per_act_token_quant: bool,
group_name: str | None,
):
from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
PplxPrepareAndFinalize,
pplx_hidden_dim_scale_bytes,
)
max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
num_local_experts = rank_chunk(num_experts, 0, world_size)
hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
max_num_tokens,
hidden_dim,
in_dtype,
quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
args = dict(
max_num_tokens=max_num_tokens,
num_experts=num_experts,
experts_per_token=topk,
rank=rank,
world_size=world_size,
dp_size=dp_size,
hidden_dim=hidden_dim,
hidden_dim_bytes=hidden_dim_bytes,
hidden_dim_scale_bytes=scale_bytes,
)
if group_name is None:
ata = AllToAll.internode(**args)
else:
args["group_name"] = group_name
ata = AllToAll.intranode(**args)
prepare_finalize = PplxPrepareAndFinalize(
ata,
max_num_tokens=max_num_tokens,
num_local_experts=num_local_experts,
num_dispatchers=world_size // dp_size,
)
return prepare_finalize, ata
def rank_chunk(num: int, r: int, w: int) -> int:
rem = num % w
return (num // w) + (1 if r < rem else 0)
def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
chunk = rank_chunk(t.shape[0], r, w)
return t[(r * chunk) : (r + 1) * chunk]
def maybe_chunk_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
if t is not None:
return chunk_by_rank(t, r, w)
else:
return t
def chunk_scales_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
if t is not None and t.numel() > 1:
chunk = rank_chunk(t.shape[0], r, w)
return t[(r * chunk) : (r + 1) * chunk]
else:
return t
def chunk_scales(t: torch.Tensor | None, start: int, end: int) -> torch.Tensor | None:
if t is not None and t.numel() > 1:
return t[start:end]
else:
return t
def dummy_work(a: torch.Tensor) -> torch.Tensor:
return a * 1.1
def pplx_prepare_finalize(
pgi: ProcessGroupInfo,
dp_size: int,
a: torch.Tensor,
topk_weight: torch.Tensor,
topk_ids: torch.Tensor,
num_experts: int,
quant_dtype: torch.dtype | None,
block_shape: list[int] | None,
per_act_token_quant: bool,
group_name: str | None,
) -> torch.Tensor:
assert torch.cuda.current_device() == pgi.local_rank
topk = topk_ids.shape[1]
num_tokens, hidden_dim = a.shape
device = pgi.device
rank = pgi.rank
world_size = pgi.world_size
topk_ids = topk_ids.to(dtype=torch.uint32)
prepare_finalize, ata = create_pplx_prepare_finalize(
num_tokens,
hidden_dim,
topk,
num_experts,
rank,
dp_size,
world_size,
a.dtype,
quant_dtype,
block_shape,
per_act_token_quant,
group_name,
)
assert a.shape[0] == topk_ids.shape[0]
a_chunk = chunk_by_rank(a, rank, world_size).to(device)
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
assert a_chunk.shape[0] == chunk_topk_ids.shape[0]
out = torch.full(
a_chunk.shape,
torch.nan,
dtype=a.dtype,
device=device,
)
if quant_dtype is not None and not per_act_token_quant and block_shape is None:
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
else:
a1_scale = None
a2_scale = None
b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
a_chunk,
chunk_topk_weight,
chunk_topk_ids,
num_experts,
None,
False,
FusedMoEQuantConfig.make(
quant_dtype,
per_act_token_quant=per_act_token_quant,
per_out_ch_quant=False,
block_shape=block_shape,
a1_scale=a1_scale,
a2_scale=a2_scale,
),
)
b_a = dummy_work(dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
prepare_finalize.finalize(
out,
b_a,
chunk_topk_weight,
chunk_topk_ids,
False,
weight_and_reduce_impl=TopKWeightAndReduceDelegate(),
)
torch.cuda.synchronize()
ata.destroy()
num_tokens = a_chunk.shape[0]
return out[:num_tokens]
def _pplx_prepare_finalize(
pgi: ProcessGroupInfo,
dp_size: int,
a: torch.Tensor,
score: torch.Tensor,
topk: torch.Tensor,
num_experts: int,
quant_dtype: torch.dtype | None,
block_shape: list[int] | None,
per_act_token_quant: bool,
use_internode: bool,
):
try:
if use_internode:
uid = (
nvshmem_get_unique_id()
if pgi.rank == 0
else nvshmem_alloc_empty_unique_id()
)
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
group_name = None
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
group_name = cpu_group.group_name
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
m, k = a.shape
a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
torch_output = (
a_rep.view(m, topk, k) * topk_weight.view(m, topk, 1).to(a_rep.dtype)
).sum(dim=1)
pplx_output = pplx_prepare_finalize(
pgi,
dp_size,
a,
topk_weight,
topk_ids,
num_experts,
quant_dtype,
block_shape,
per_act_token_quant,
group_name,
)
torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(
pgi.device
)
torch.testing.assert_close(pplx_output, torch_output, atol=3e-2, rtol=3e-2)
finally:
if use_internode:
nvshmem_finalize()
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx
@multi_gpu_test(num_gpus=2)
def test_pplx_prepare_finalize_slow(
mnk: tuple[int, int, int],
e: int,
topk: int,
dtype: torch.dtype,
world_dp_size: tuple[int, int],
per_act_token_quant: bool,
block_shape: list[int] | None,
use_internode: bool,
):
if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True
act_dtype = torch.bfloat16
quant_dtype = dtype
else:
use_fp8_w8a8 = False
act_dtype = dtype
quant_dtype = None
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination")
set_random_seed(7)
m, n, k = mnk
world_size, dp_size = world_dp_size
device = "cuda"
a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
score = torch.randn((m, e), device=device, dtype=act_dtype)
parallel_launch(
world_size,
_pplx_prepare_finalize,
dp_size,
a,
score,
topk,
e,
quant_dtype,
block_shape,
per_act_token_quant,
use_internode,
)
def pplx_moe(
group_name: str | None,
rank: int,
world_size: int,
dp_size: int,
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
topk_weight: torch.Tensor,
topk_ids: torch.Tensor,
w1_scale: torch.Tensor | None = None,
w2_scale: torch.Tensor | None = None,
a1_scale: torch.Tensor | None = None,
a2_scale: torch.Tensor | None = None,
quant_dtype: torch.dtype | None = None,
per_act_token_quant=False,
block_shape: list[int] | None = None,
use_compile: bool = False,
use_cudagraphs: bool = True,
shared_experts: torch.nn.Module | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
num_tokens, hidden_dim = a.shape
num_experts = w1.shape[0]
topk = topk_ids.shape[1]
max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 16)
prepare_finalize, ata = create_pplx_prepare_finalize(
num_tokens,
hidden_dim,
topk,
num_experts,
rank,
dp_size,
world_size,
a.dtype,
quant_dtype,
block_shape,
per_act_token_quant,
group_name,
)
topk_ids = topk_ids.to(dtype=torch.uint32)
# Note: workers with the same dp_rank must use the exact same inputs.
a_chunk = chunk_by_rank(a, rank, world_size)
chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size)
chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size)
# Chunking weights like this only works for batched format
w1_chunk = chunk_by_rank(w1, rank, world_size)
w2_chunk = chunk_by_rank(w2, rank, world_size)
w1_scale_chunk = maybe_chunk_by_rank(w1_scale, rank, world_size)
w2_scale_chunk = maybe_chunk_by_rank(w2_scale, rank, world_size)
a1_scale_chunk = chunk_scales_by_rank(a1_scale, rank, world_size)
a2_scale_chunk = chunk_scales_by_rank(a2_scale, rank, world_size)
quant_config = FusedMoEQuantConfig.make(
quant_dtype,
block_shape=block_shape,
per_act_token_quant=per_act_token_quant,
w1_scale=w1_scale_chunk,
w2_scale=w2_scale_chunk,
a1_scale=a1_scale_chunk,
a2_scale=a2_scale_chunk,
)
experts = BatchedTritonExperts(
max_num_tokens=max_num_tokens,
num_dispatchers=prepare_finalize.num_dispatchers(),
quant_config=quant_config,
moe_config=make_dummy_moe_config(),
)
fused_experts = FusedMoEModularKernel(
prepare_finalize,
experts,
shared_experts,
inplace=False,
)
# Note: for now use_compile will error out if the problem size is
# large enough to trigger chunking. I'm leaving the flag and
# setup code in case we are able to revisit this later.
if use_compile:
_fused_experts = torch.compile(
fused_experts, backend="inductor", fullgraph=True
)
torch._dynamo.mark_dynamic(a_chunk, 0)
torch._dynamo.mark_dynamic(chunk_topk_weight, 0)
torch._dynamo.mark_dynamic(chunk_topk_ids, 0)
else:
_fused_experts = fused_experts
out = _fused_experts(
a_chunk,
w1_chunk,
w2_chunk,
chunk_topk_weight,
chunk_topk_ids,
global_num_experts=num_experts,
)
if use_cudagraphs:
if isinstance(out, tuple):
out[0].fill_(0)
out[1].fill_(0)
else:
out.fill_(0)
stream = torch.cuda.Stream()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, stream=stream):
out = _fused_experts(
a_chunk,
w1_chunk,
w2_chunk,
chunk_topk_weight,
chunk_topk_ids,
global_num_experts=num_experts,
)
torch.cuda.synchronize()
graph.replay()
torch.cuda.synchronize()
ata.destroy()
return out
def _pplx_moe(
pgi: ProcessGroupInfo,
dp_size: int,
a: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
score: torch.Tensor,
topk: int,
num_experts: int,
w1_s: torch.Tensor | None = None,
w2_s: torch.Tensor | None = None,
quant_dtype: torch.dtype | None = None,
per_act_token_quant: bool = False,
block_shape: list[int] | None = None,
use_internode: bool = False,
shared_experts: torch.nn.Module | None = None,
):
try:
if use_internode:
uid = (
nvshmem_get_unique_id()
if pgi.rank == 0
else nvshmem_alloc_empty_unique_id()
)
torch.distributed.broadcast(uid, src=0)
nvshmem_init(uid, pgi.rank, pgi.world_size)
group_name = None
else:
group_ranks = list(range(pgi.world_size))
cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
group_name = cpu_group.group_name
m, k = a.shape
e, _, n = w2.shape
moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
device = torch.device("cuda", pgi.rank)
rank = pgi.rank
world_size = pgi.world_size
a = a.to(device)
w1 = w1.to(device)
w2 = w2.to(device)
w1_s = w1_s.to(device) if w1_s is not None else None
w2_s = w2_s.to(device) if w2_s is not None else None
if quant_dtype is not None and not per_act_token_quant and block_shape is None:
a1_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
a2_scale = torch.tensor(1.0, device="cuda", dtype=torch.float32)
else:
a1_scale = None
a2_scale = None
with set_current_vllm_config(vllm_config), override_config(moe_config):
topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
shared_output = shared_experts(a) if shared_experts is not None else None
torch_output = torch_experts(
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
batched_output = naive_batched_moe(
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
)
pplx_outputs = pplx_moe(
group_name,
rank,
world_size,
dp_size,
a,
w1,
w2,
topk_weight,
topk_ids,
w1_scale=w1_s,
w2_scale=w2_s,
a1_scale=a1_scale,
a2_scale=a2_scale,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
shared_experts=shared_experts,
)
if shared_experts is None:
pplx_shared_output = None
pplx_output = pplx_outputs
assert isinstance(pplx_output, torch.Tensor)
else:
pplx_shared_output, pplx_output = pplx_outputs
if shared_output is not None:
assert pplx_shared_output is not None
chunked_shared_output = chunk_by_rank(
shared_output, pgi.rank, pgi.world_size
).to(pplx_shared_output.device)
else:
chunked_shared_output = None
chunked_batch_output = chunk_by_rank(
batched_output, pgi.rank, pgi.world_size
).to(pplx_output.device)
torch.testing.assert_close(batched_output, torch_output, atol=3e-2, rtol=3e-2)
torch.testing.assert_close(
pplx_output, chunked_batch_output, atol=3e-2, rtol=3e-2
)
if shared_experts is not None:
assert chunked_shared_output is not None
assert pplx_shared_output is not None
torch.testing.assert_close(
pplx_shared_output, chunked_shared_output, atol=3e-2, rtol=3e-2
)
finally:
if use_internode:
nvshmem_finalize()
@pytest.mark.parametrize("mnk", PPLX_COMBOS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.optional
@requires_pplx
@multi_gpu_test(num_gpus=2)
def test_pplx_moe_slow(
mnk: tuple[int, int, int],
e: int,
topk: int,
dtype: torch.dtype,
world_dp_size: tuple[int, int],
per_act_token_quant: bool,
block_shape: list[int] | None,
use_internode: bool,
):
set_random_seed(7)
m, n, k = mnk
world_size, dp_size = world_dp_size
if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True
quant_dtype = dtype
else:
use_fp8_w8a8 = False
quant_dtype = None
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
pytest.skip("Skip quantization test for non-quantized type")
if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination")
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
(_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights(
e,
n,
k,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_out_ch_quant=per_act_token_quant,
)
parallel_launch(
world_size,
_pplx_moe,
dp_size,
a,
w1,
w2,
score,
topk,
e,
w1_s,
w2_s,
quant_dtype,
per_act_token_quant,
block_shape,
use_internode,
)
def _pplx_test_loop(
pgi: ProcessGroupInfo,
dp_size: int,
use_internode: bool,
use_shared_experts: bool,
make_weights: bool,
test_fn: Callable,
):
device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device)
def format_result(msg, ex=None):
if ex is not None:
x = str(ex)
newx = x.strip(" \n\t")[:16]
if len(newx) < len(x):
newx = newx + " ..."
prefix = "E\t"
print(f"{textwrap.indent(traceback.format_exc(), prefix)}")
print(f"FAILED {msg} - {newx}\n")
else:
print(f"PASSED {msg}")
if use_shared_experts:
# Note: this config is only needed for the non-naive shared experts.
new_vllm_config = copy.deepcopy(vllm_config)
new_vllm_config.parallel_config.data_parallel_size = pgi.world_size
new_vllm_config.parallel_config.enable_expert_parallel = True
_set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
set_random_seed(7)
combos = itertools.product(
PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
)
exceptions = []
count = 0
for mnk, e, topk, dtype, per_act_token_quant, block_shape in combos:
count = count + 1
m, n, k = mnk
if dtype == torch.float8_e4m3fn:
use_fp8_w8a8 = True
quant_dtype = dtype
else:
use_fp8_w8a8 = False
quant_dtype = None
test_desc = (
f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, "
f"dtype={dtype}, per_act_token={per_act_token_quant}, "
f"block_shape={block_shape}, use_internode={use_internode}, "
f"use_shared_experts={use_shared_experts}"
)
if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
print(f"{test_desc} - Skip quantization test for non-quantized type.")
continue
if per_act_token_quant and block_shape is not None:
print(f"{test_desc} - Skip illegal quantization combination.")
continue
a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
args = dict()
if make_weights:
(_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights(
e,
n,
k,
quant_dtype=quant_dtype,
block_shape=block_shape,
per_out_ch_quant=per_act_token_quant,
)
args["w1"] = w1
args["w2"] = w2
args["w1_s"] = w1_s
args["w2_s"] = w2_s
if use_shared_experts:
args["shared_experts"] = make_shared_experts(
n,
k,
in_dtype=a.dtype,
quant_dtype=quant_dtype,
)
try:
test_fn(
pgi=pgi,
dp_size=dp_size,
a=a,
score=score,
topk=topk,
num_experts=e,
quant_dtype=quant_dtype,
per_act_token_quant=per_act_token_quant,
block_shape=block_shape,
use_internode=use_internode,
**args,
)
format_result(test_desc)
except Exception as ex:
format_result(test_desc, ex)
exceptions.append(ex)
if len(exceptions) > 0:
raise RuntimeError(
f"{len(exceptions)} of {count} tests failed in child process, "
f"rank={pgi.rank}."
)
else:
print(f"{count} of {count} tests passed in child process, rank={pgi.rank}.")
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@requires_pplx
@multi_gpu_test(num_gpus=2)
def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int],
use_internode: bool,
):
set_random_seed(7)
world_size, dp_size = world_dp_size
parallel_launch(
world_size * dp_size,
_pplx_test_loop,
dp_size,
use_internode,
False,
False,
_pplx_prepare_finalize,
)
@pytest.mark.parametrize("world_dp_size", [[2, 1]])
@pytest.mark.parametrize("use_internode", [False])
@pytest.mark.parametrize("use_shared_experts", [False, True])
@requires_pplx
@multi_gpu_test(num_gpus=2)
def test_pplx_moe(
world_dp_size: tuple[int, int],
use_internode: bool,
use_shared_experts: bool,
):
set_random_seed(7)
world_size, dp_size = world_dp_size
parallel_launch(
world_size,
_pplx_test_loop,
dp_size,
use_internode,
use_shared_experts,
True,
_pplx_moe,
)
......@@ -4,7 +4,7 @@ Large-scale cluster-level expert parallel, as described in the [DeepSeek-V3 Tech
Here we break down the requirements in 2 steps:
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
1. Build and install the Python libraries ([DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
Step 2 is necessary for multi-node deployment.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment