Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b4054c8a
Unverified
Commit
b4054c8a
authored
Dec 10, 2025
by
Sage Moore
Committed by
GitHub
Dec 11, 2025
Browse files
Revert "[CI] Add Async Eplb nightly CI tests (#29385)" (#30431)
parent
25221b44
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
4 additions
and
167 deletions
+4
-167
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
...eduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
+0
-73
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
...ts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+0
-1
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
...s/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+0
-74
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+1
-19
vllm/distributed/eplb/rebalance_execute.py
vllm/distributed/eplb/rebalance_execute.py
+3
-0
No files found.
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
deleted
100644 → 0
View file @
25221b44
#!/usr/bin/env bash
# Usage: deepseek_v2_lite_ep_async_eplb.sh [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#
# Env: OUT_DIR - directory that receives the result files
#                (default: /tmp/vllm-scheduled); created if missing.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD="${1:-0.25}"  # minimum accuracy accepted by the final check
NUM_Q="${2:-1319}"      # value passed to the eval's --num-questions
PORT="${3:-8030}"       # port of the first server; bumped per backend

# Keep a caller-provided OUT_DIR, otherwise fall back to the default.
: "${OUT_DIR:=/tmp/vllm-scheduled}"
mkdir -p "$OUT_DIR"
#######################################
# Poll http://127.0.0.1:<port>/health once per second until curl succeeds.
# Arguments: $1 - port to poll
# Returns:   0 once the endpoint answers; otherwise the exit status of
#            `timeout` (the whole wait is capped at 600 seconds).
#######################################
wait_for_server() {
  local port=$1
  # The port is passed to the inner shell as its $1 rather than being spliced
  # into the program text, so the value is never re-parsed as shell code.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:${1}/health" > /dev/null; do
      sleep 1
    done' _ "$port"
}
MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform.
# ROCm is assumed when rocm-smi is on PATH, /opt/rocm is a directory, or
# ROCM_PATH is non-empty; anything else is treated as CUDA/other.
if command -v rocm-smi > /dev/null 2>&1 \
  || [[ -d /opt/rocm || -n "${ROCM_PATH:-}" ]]; then
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
#######################################
# Stop the process recorded in SERVER_PID, if there is one and it is alive.
# Sends SIGTERM, polls up to 20 times at 0.5 s intervals for the process to
# disappear, then sends SIGKILL. Errors from kill are deliberately ignored
# because the process may exit at any point during the sequence.
# Globals:   SERVER_PID (read)
# Returns:   0
#######################################
cleanup() {
  local pid="${SERVER_PID:-}"

  # Nothing recorded, or the process is already gone: nothing to do.
  [[ -n "$pid" ]] || return 0
  kill -0 "$pid" 2>/dev/null || return 0

  kill "$pid" 2>/dev/null || true

  local attempt
  for ((attempt = 0; attempt < 20; attempt++)); do
    if ! kill -0 "$pid" 2>/dev/null; then
      break
    fi
    sleep 0.5
  done

  kill -9 "$pid" 2>/dev/null || true
}
# Make sure a still-running server is stopped on every exit path.
trap cleanup EXIT

# For each backend: start the server, wait for /health, run the GSM8K eval,
# check the saved accuracy against THRESHOLD, then stop the server and move
# to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Replace '/', ':' and ' ' so the model name is usable in a file name.
  # (The previous `tr '/: \\n' '_____'` treated `\\` as a backslash and `n`
  # as a literal letter, so it also rewrote every lowercase "n".)
  TAG="${MODEL//[\/: ]/_}"
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"

  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "$PORT" \
    --num-questions "$NUM_Q" \
    --save-results "$OUT"

  # Values are passed as arguments and the heredoc delimiter is quoted, so
  # no shell value is interpolated into the Python source.
  python3 - "$OUT" "$THRESHOLD" "$MODEL" "$BACK" <<'PY'
import json
import sys

out_path, threshold, model, backend = sys.argv[1:5]
with open(out_path) as f:
    acc = json.load(f)['accuracy']
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY

  cleanup
  SERVER_PID=   # cleared so the EXIT trap does not signal a stale PID
  sleep 1
  PORT=$((PORT + 1))
done
.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
View file @
b4054c8a
...
...
@@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size
2
\
--enable-expert-parallel
\
--enable-eplb
\
--eplb-config
'{"window_size":200,"step_interval":600}'
\
--trust-remote-code
\
--max-model-len
2048
\
--port
$PORT
&
...
...
.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
deleted
100644 → 0
View file @
25221b44
#!/usr/bin/env bash
# Usage: qwen3_next_mtp_async_eplb.sh [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#
# Env: OUT_DIR - directory that receives the result files
#                (default: /tmp/vllm-scheduled); created if missing.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD="${1:-0.25}"  # minimum accuracy accepted by the final check
NUM_Q="${2:-1319}"      # value passed to the eval's --num-questions
PORT="${3:-8040}"       # port of the first server; bumped per backend

# Keep a caller-provided OUT_DIR, otherwise fall back to the default.
: "${OUT_DIR:=/tmp/vllm-scheduled}"
mkdir -p "$OUT_DIR"
#######################################
# Poll http://127.0.0.1:<port>/health once per second until curl succeeds.
# Arguments: $1 - port to poll
# Returns:   0 once the endpoint answers; otherwise the exit status of
#            `timeout` (the whole wait is capped at 600 seconds).
#######################################
wait_for_server() {
  local port=$1
  # The port is passed to the inner shell as its $1 rather than being spliced
  # into the program text, so the value is never re-parsed as shell code.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:${1}/health" > /dev/null; do
      sleep 1
    done' _ "$port"
}
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform.
# ROCm is assumed when rocm-smi is on PATH, /opt/rocm is a directory, or
# ROCM_PATH is non-empty; anything else is treated as CUDA/other.
if command -v rocm-smi > /dev/null 2>&1 \
  || [[ -d /opt/rocm || -n "${ROCM_PATH:-}" ]]; then
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
#######################################
# Stop the process recorded in SERVER_PID, if there is one and it is alive.
# Sends SIGTERM, polls up to 20 times at 0.5 s intervals for the process to
# disappear, then sends SIGKILL. Errors from kill are deliberately ignored
# because the process may exit at any point during the sequence.
# Globals:   SERVER_PID (read)
# Returns:   0
#######################################
cleanup() {
  local pid="${SERVER_PID:-}"

  # Nothing recorded, or the process is already gone: nothing to do.
  [[ -n "$pid" ]] || return 0
  kill -0 "$pid" 2>/dev/null || return 0

  kill "$pid" 2>/dev/null || true

  local attempt
  for ((attempt = 0; attempt < 20; attempt++)); do
    if ! kill -0 "$pid" 2>/dev/null; then
      break
    fi
    sleep 0.5
  done

  kill -9 "$pid" 2>/dev/null || true
}
# Make sure a still-running server is stopped on every exit path.
trap cleanup EXIT

# For each backend: start the server, wait for /health, run the GSM8K eval,
# check the saved accuracy against THRESHOLD, then stop the server and move
# to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Replace '/', ':' and ' ' so the model name is usable in a file name.
  # (The previous `tr '/: \\n' '_____'` treated `\\` as a backslash and `n`
  # as a literal letter, so it also rewrote every lowercase "n" and turned
  # this model's tag into "Qwe__Qwe_3-Next-80B-A3B-I_struct".)
  TAG="${MODEL//[\/: ]/_}"
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"

  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "$PORT" \
    --num-questions "$NUM_Q" \
    --save-results "$OUT"

  # Values are passed as arguments and the heredoc delimiter is quoted, so
  # no shell value is interpolated into the Python source.
  python3 - "$OUT" "$THRESHOLD" "$MODEL" "$BACK" <<'PY'
import json
import sys

out_path, threshold, model, backend = sys.argv[1:5]
with open(out_path) as f:
    acc = json.load(f)['accuracy']
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY

  cleanup
  SERVER_PID=   # cleared so the EXIT trap does not signal a stale PID
  sleep 1
  PORT=$((PORT + 1))
done
.buildkite/test-pipeline.yaml
View file @
b4054c8a
...
...
@@ -1379,22 +1379,4 @@ steps:
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
-
label
:
DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319
8030
-
label
:
Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes
:
60
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319
8040
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
vllm/distributed/eplb/rebalance_execute.py
View file @
b4054c8a
...
...
@@ -322,6 +322,9 @@ async def transfer_layer(
num_local_physical_experts
=
next
(
iter
(
expert_weights
[
0
])).
shape
[
0
]
assert
new_global_expert_indices
.
shape
==
(
num_moe_layers
,
num_physical_experts
)
assert
num_physical_experts
==
ep_size
*
num_local_physical_experts
# A buffer to hold the expert weights in one layer during the exchange.
# NOTE: Currently we assume the same weights across different layers
# have the same shape.
is_unchanged
,
is_received_locally
,
experts_recv_loc
=
move_to_buffer
(
num_local_experts
=
num_local_physical_experts
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment