Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
906e461e
Unverified
Commit
906e461e
authored
Aug 25, 2025
by
Michael Goin
Committed by
GitHub
Aug 25, 2025
Browse files
[CI Fix] Pin deepep and pplx tags in tools/ep_kernels/, gate multigpu tests (#23568)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
2a97ffc3
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
40 additions
and
12 deletions
+40
-12
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+1
-0
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+5
-7
tests/kernels/moe/test_deepep_deepgemm_moe.py
tests/kernels/moe/test_deepep_deepgemm_moe.py
+3
-0
tests/kernels/moe/test_deepep_moe.py
tests/kernels/moe/test_deepep_moe.py
+3
-0
tests/kernels/moe/test_modular_kernel_combinations.py
tests/kernels/moe/test_modular_kernel_combinations.py
+2
-0
tests/kernels/moe/test_pplx_cutlass_moe.py
tests/kernels/moe/test_pplx_cutlass_moe.py
+2
-0
tests/kernels/moe/test_pplx_moe.py
tests/kernels/moe/test_pplx_moe.py
+5
-0
tests/utils.py
tests/utils.py
+6
-3
tools/ep_kernels/install_python_libraries.sh
tools/ep_kernels/install_python_libraries.sh
+13
-2
No files found.
.buildkite/test-pipeline.yaml
View file @
906e461e
...
...
@@ -390,6 +390,7 @@ steps:
-
csrc/moe/
-
tests/kernels/moe
-
vllm/model_executor/layers/fused_moe/
-
vllm/distributed/device_communicators/
commands
:
-
pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
2
...
...
tests/distributed/test_comm_ops.py
View file @
906e461e
...
...
@@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
tensor_model_parallel_all_reduce
,
tensor_model_parallel_reduce_scatter
)
from
..utils
import
init_test_distributed_environment
,
multi_process_parallel
from
..utils
import
(
init_test_distributed_environment
,
multi_gpu_test
,
multi_process_parallel
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -226,8 +227,7 @@ def send_recv_test_worker(
torch
.
testing
.
assert_close
(
test_tensor
,
recv_tensor
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"test_target"
,
[
all_reduce_test_worker
,
all_gather_test_worker
,
...
...
@@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel(
multi_process_parallel
(
monkeypatch
,
tp_size
,
1
,
test_target
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"pp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"test_target"
,
[
send_recv_test_worker
,
send_recv_tensor_dict_test_worker
])
...
...
@@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel(
multi_process_parallel
(
monkeypatch
,
1
,
pp_size
,
test_target
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"pp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"test_target"
,
[
...
...
tests/kernels/moe/test_deepep_deepgemm_moe.py
View file @
906e461e
...
...
@@ -23,6 +23,7 @@ from vllm.utils import has_deep_ep, has_deep_gemm
from
vllm.utils.deep_gemm
import
(
is_blackwell_deep_gemm_e8m0_used
,
is_deep_gemm_supported
)
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
from
.utils
import
make_test_weights
...
...
@@ -370,6 +371,7 @@ NUM_EXPERTS = [32]
@
pytest
.
mark
.
parametrize
(
"num_experts"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOPKS
)
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[(
2
,
1
)])
@
multi_gpu_test
(
num_gpus
=
2
)
@
requires_deep_ep
@
requires_deep_gemm
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_e8m0_used
(),
...
...
@@ -427,6 +429,7 @@ USE_FP8_DISPATCH = [False]
@
pytest
.
mark
.
parametrize
(
"use_fp8_dispatch"
,
USE_FP8_DISPATCH
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[(
2
,
1
)])
@
multi_gpu_test
(
num_gpus
=
2
)
@
requires_deep_ep
@
requires_deep_gemm
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_e8m0_used
(),
...
...
tests/kernels/moe/test_deepep_moe.py
View file @
906e461e
...
...
@@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from
vllm.platforms
import
current_platform
from
vllm.utils
import
has_deep_ep
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
if
has_deep_ep
():
...
...
@@ -411,6 +412,7 @@ DTYPES = [torch.bfloat16, torch.float8_e4m3fn]
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
6
])
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[(
2
,
1
)])
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
@
multi_gpu_test
(
num_gpus
=
2
)
@
requires_deep_ep
def
test_deep_ep_moe
(
dtype
:
torch
.
dtype
,
...
...
@@ -459,6 +461,7 @@ USE_FP8_DISPATCH = [True, False]
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
6
])
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[(
2
,
1
)])
@
pytest
.
mark
.
parametrize
(
"use_fp8_dispatch"
,
USE_FP8_DISPATCH
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
requires_deep_ep
def
test_low_latency_deep_ep_moe
(
dtype
:
torch
.
dtype
,
mnk
:
tuple
[
int
,
int
,
int
],
num_experts
:
int
,
topk
:
int
,
...
...
tests/kernels/moe/test_modular_kernel_combinations.py
View file @
906e461e
...
...
@@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from
vllm.utils
import
has_deep_ep
,
has_deep_gemm
,
has_pplx
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
...utils
import
multi_gpu_test
from
.modular_kernel_tools.common
import
(
Config
,
RankTensors
,
WeightTensors
,
reference_moe_impl
,
run_modular_kernel
)
...
...
@@ -162,6 +163,7 @@ def is_nyi_config(config: Config) -> bool:
product
(
MK_MULTI_GPU_PREPARE_FINALIZE_TYPES
,
MK_FUSED_EXPERT_TYPES
))
@
pytest
.
mark
.
parametrize
(
"fused_moe_chunk_size"
,
FUSED_MOE_CHUNK_SIZEs
)
@
pytest
.
mark
.
parametrize
(
"world_size"
,
[
2
])
@
multi_gpu_test
(
num_gpus
=
2
)
@
meets_multi_gpu_requirements
def
test_modular_kernel_combinations_multigpu
(
k
:
int
,
n
:
int
,
e
:
int
,
dtype
:
torch
.
dtype
,
...
...
tests/kernels/moe/test_pplx_cutlass_moe.py
View file @
906e461e
...
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
from
vllm.platforms
import
current_platform
from
vllm.utils
import
cdiv
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
try
:
...
...
@@ -247,6 +248,7 @@ def _pplx_moe(
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[[
2
,
1
]])
#, [4, 2]])
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
skipif
(
(
lambda
x
:
x
is
None
or
not
ops
.
cutlass_group_gemm_supported
(
x
.
to_int
()))(
current_platform
.
get_device_capability
()),
...
...
tests/kernels/moe/test_pplx_moe.py
View file @
906e461e
...
...
@@ -37,6 +37,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
from
vllm.platforms
import
current_platform
from
vllm.utils
import
round_up
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
requires_pplx
=
pytest
.
mark
.
skipif
(
...
...
@@ -452,6 +453,7 @@ def _pplx_prepare_finalize(
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
pytest
.
mark
.
optional
@
requires_pplx
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_pplx_prepare_finalize_slow
(
mnk
:
tuple
[
int
,
int
,
int
],
e
:
int
,
...
...
@@ -740,6 +742,7 @@ def _pplx_moe(
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
pytest
.
mark
.
optional
@
requires_pplx
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_pplx_moe_slow
(
mnk
:
tuple
[
int
,
int
,
int
],
e
:
int
,
...
...
@@ -880,6 +883,7 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool,
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[[
2
,
1
]])
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
requires_pplx
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_pplx_prepare_finalize
(
world_dp_size
:
tuple
[
int
,
int
],
use_internode
:
bool
,
...
...
@@ -893,6 +897,7 @@ def test_pplx_prepare_finalize(
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[[
2
,
1
]])
@
pytest
.
mark
.
parametrize
(
"use_internode"
,
[
False
])
@
requires_pplx
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_pplx_moe
(
world_dp_size
:
tuple
[
int
,
int
],
use_internode
:
bool
,
...
...
tests/utils.py
View file @
906e461e
...
...
@@ -696,9 +696,12 @@ def multi_process_parallel(
os
.
environ
[
"RAY_RUNTIME_ENV_IGNORE_GITIGNORE"
]
=
"1"
ray
.
init
(
runtime_env
=
{
"working_dir"
:
VLLM_PATH
,
"excludes"
:
[
"build"
,
".git"
,
"cmake-build-*"
,
"shellcheck"
,
"dist"
]
"working_dir"
:
VLLM_PATH
,
"excludes"
:
[
"build"
,
".git"
,
"cmake-build-*"
,
"shellcheck"
,
"dist"
,
"ep_kernels_workspace"
]
})
distributed_init_port
=
get_open_port
()
...
...
tools/ep_kernels/install_python_libraries.sh
View file @
906e461e
...
...
@@ -77,6 +77,7 @@ clone_repo() {
local
repo_url
=
$1
local
dir_name
=
$2
local
key_file
=
$3
local
commit_hash
=
$4
if
[
-d
"
$dir_name
"
]
;
then
# Check if directory has uncommitted changes (dirty)
...
...
@@ -87,17 +88,27 @@ clone_repo() {
echo
"
$dir_name
directory exists but clone appears incomplete, cleaning up and re-cloning"
rm
-rf
"
$dir_name
"
git clone
"
$repo_url
"
if
[
-n
"
$commit_hash
"
]
;
then
cd
"
$dir_name
"
git checkout
"
$commit_hash
"
cd
..
fi
else
echo
"
$dir_name
directory exists and appears complete; manually update if needed"
fi
else
git clone
"
$repo_url
"
if
[
-n
"
$commit_hash
"
]
;
then
cd
"
$dir_name
"
git checkout
"
$commit_hash
"
cd
..
fi
fi
}
# build and install pplx, require pytorch installed
pushd
$WORKSPACE
clone_repo
"https://github.com/ppl-ai/pplx-kernels"
"pplx-kernels"
"setup.py"
clone_repo
"https://github.com/ppl-ai/pplx-kernels"
"pplx-kernels"
"setup.py"
"c336faf"
cd
pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
...
...
@@ -106,7 +117,7 @@ popd
# build and install deepep, require pytorch installed
pushd
$WORKSPACE
clone_repo
"https://github.com/deepseek-ai/DeepEP"
"DeepEP"
"setup.py"
clone_repo
"https://github.com/deepseek-ai/DeepEP"
"DeepEP"
"setup.py"
"e3908bf"
cd
DeepEP
export
NVSHMEM_DIR
=
$WORKSPACE
/nvshmem_install
PIP_NO_BUILD_ISOLATION
=
0 pip
install
-vvv
-e
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment