Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1bc9c77f
Unverified
Commit
1bc9c77f
authored
Mar 09, 2026
by
liuzhenwei
Committed by
GitHub
Mar 09, 2026
Browse files
[XPU] Add test script of PD disaggregation (#36434)
Signed-off-by:
zhenwei-intel
<
zhenwei.liu@intel.com
>
parent
65a4da15
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
174 additions
and
0 deletions
+174
-0
tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh
...onnector/nixl_integration/run_xpu_disagg_accuracy_test.sh
+174
-0
No files found.
tests/v1/kv_connector/nixl_integration/run_xpu_disagg_accuracy_test.sh
0 → 100644
View file @
1bc9c77f
#!/bin/bash
set -o errexit

# Every setting below keeps a value already present in the environment and
# only falls back to the default shown when the variable is unset or empty.

# Hosts / ports
: "${PREFILL_HOST:=localhost}"
: "${PREFILL_PORT:=8100}"
: "${PREFILL_NIXL_SIDE_PORT:=5577}"
: "${DECODE_HOST:=localhost}"
: "${DECODE_PORT:=8200}"
: "${PROXY_HOST:=localhost}"
: "${PROXY_PORT:=8192}"
: "${BASELINE_HOST:=localhost}"
: "${BASELINE_PORT:=9290}"

# Model to run.
: "${MODEL_NAME:=Qwen/Qwen3-0.6B}"
: "${MAX_MODEL_LEN:=1024}"
: "${BLOCK_SIZE:=64}"
: "${PREFILLER_TP_SIZE:=1}"
: "${DECODER_TP_SIZE:=1}"
: "${KV_BUFFER_DEVICE:=xpu}"
: "${GPU_MEMORY_UTILIZATION:=0.8}"
#######################################
# Print a comma-separated run of consecutive device indices.
# Arguments:
#   $1 - number of indices to emit
#   $2 - first index (defaults to 0)
# Outputs:
#   The list on stdout, e.g. "2,3" for count=2 start=2; an empty line when
#   the count is 0.
#######################################
generate_affinity_mask() {
  local how_many=$1
  local first=${2:-0}
  local -a devices=()
  local offset

  for ((offset = 0; offset < how_many; offset++)); do
    devices+=("$((first + offset))")
  done

  # "${devices[*]}" joins the elements with the first character of IFS.
  local IFS=,
  echo "${devices[*]}"
}
# Device masks: unless already provided, the prefiller mask starts at index 0
# and spans PREFILLER_TP_SIZE entries; the decoder mask starts at index
# PREFILLER_TP_SIZE and spans DECODER_TP_SIZE entries.
if [[ -z "${PREFILLER_ZE_AFFINITY_MASK}" ]]; then
  PREFILLER_ZE_AFFINITY_MASK=$(generate_affinity_mask "${PREFILLER_TP_SIZE}" 0)
fi
if [[ -z "${DECODER_ZE_AFFINITY_MASK}" ]]; then
  DECODER_ZE_AFFINITY_MASK=$(generate_affinity_mask "${DECODER_TP_SIZE}" "${PREFILLER_TP_SIZE}")
fi

# execution env
GIT_ROOT="$(git rev-parse --show-toplevel)"
EXP_ROOT="${GIT_ROOT}/tests/v1/kv_connector/nixl_integration"
# File passed to the accuracy test as --file_name; overridable from the
# environment.
if [[ -z "${OUTPUT_FILE}" ]]; then
  OUTPUT_FILE="${EXP_ROOT}/.xpu_accuracy_test_outputs.txt"
fi
# Stop background jobs on Ctrl+C (SIGINT), SIGTERM, or any other exit path.
# - The EXIT handler only calls `kill` when `jobs -pr` reports running jobs
#   and never returns non-zero, so it neither prints a `kill` usage error nor
#   risks altering the script's exit status under `set -e`.
# - SIGINT / SIGTERM exit explicitly (130 / 143); the EXIT handler then fires
#   and does the actual killing, so an interrupted run cannot carry on.
# ${bg_pids} is left unquoted on purpose: it holds one PID per line.
trap 'bg_pids=$(jobs -pr); [[ -z "${bg_pids}" ]] || kill ${bg_pids} 2>/dev/null || true' EXIT
trap 'exit 130' SIGINT
trap 'exit 143' SIGTERM
#######################################
# Best-effort teardown: signal every process whose command line matches
# "vllm serve", then pause for 2 seconds.
# Outputs:
#   A progress message on stdout.
#######################################
cleanup() {
  printf '%s\n' "Cleaning up any running vLLM instances..."
  if ! pkill -f "vllm serve"; then
    # A non-zero status from pkill is deliberately ignored.
    true
  fi
  sleep 2
}
#######################################
# Block until <host>:<port>/v1/completions answers a curl request.
# Retries once per second; the whole wait is capped at 1200 seconds by
# `timeout`.
# Arguments:
#   $1 - server host
#   $2 - server port
# Returns:
#   0 once curl exits successfully, 1 if the wait did not succeed.
#######################################
wait_for_server() {
  local server_host=$1
  local server_port=$2
  local probe_loop="
    until curl -s ${server_host}:${server_port}/v1/completions > /dev/null; do
      sleep 1
    done"

  if timeout 1200 bash -c "${probe_loop}"; then
    return 0
  fi
  return 1
}
#######################################
# Start the non-disaggregated baseline `vllm serve` instance in the
# background and wait until it answers requests.
#
# The command is held as an argv array and started through `env` rather than
# being concatenated into a string and re-parsed by `bash -c`, so values
# containing spaces or shell metacharacters (e.g. a local MODEL_NAME path)
# reach vllm as single, literal arguments.
#
# Globals:
#   MODEL_NAME, BASELINE_HOST, BASELINE_PORT, MAX_MODEL_LEN, BLOCK_SIZE,
#   GPU_MEMORY_UTILIZATION (read)
#   BASELINE_BASE_CMD (written: human-readable rendering of the command)
# Outputs:
#   The rendered command line on stdout.
# Returns:
#   The status of wait_for_server.
#######################################
launch_baseline() {
  local -a baseline_env=(
    "ZE_AFFINITY_MASK=0"
    "VLLM_WORKER_MULTIPROC_METHOD=spawn"
    "VLLM_ENABLE_V1_MULTIPROCESSING=1"
  )
  local -a baseline_cmd=(
    vllm serve "${MODEL_NAME}"
    --host "${BASELINE_HOST}"
    --port "${BASELINE_PORT}"
    --max-model-len "${MAX_MODEL_LEN}"
    --seed 42
    -tp 1
    --block-size "${BLOCK_SIZE}"
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
    --dtype float16
    --enforce-eager
  )

  # Kept as a global for logging, as before; it is no longer what gets executed.
  BASELINE_BASE_CMD="${baseline_env[*]} ${baseline_cmd[*]}"
  echo "${BASELINE_BASE_CMD}"

  env "${baseline_env[@]}" "${baseline_cmd[@]}" &

  # Give the server a head start before polling it.
  sleep 10
  wait_for_server "${BASELINE_HOST}" "${BASELINE_PORT}"
}
#######################################
# Start the prefill and decode `vllm serve` instances in the background and
# wait until both answer requests.
#
# Both instances use the NixlConnector with kv_role "kv_both" and the KV
# buffer device from KV_BUFFER_DEVICE. Only the prefill instance is given the
# VLLM_NIXL_SIDE_CHANNEL_HOST / _PORT environment variables.
#
# Commands are held as argv arrays and started through `env` rather than
# being concatenated into strings and re-parsed by `bash -c`, so values with
# spaces or shell metacharacters are passed literally and the JSON config
# needs no hand-escaped quoting layers.
#
# Globals:
#   MODEL_NAME, MAX_MODEL_LEN, BLOCK_SIZE, GPU_MEMORY_UTILIZATION,
#   KV_BUFFER_DEVICE, PREFILL_HOST, PREFILL_PORT, PREFILL_NIXL_SIDE_PORT,
#   PREFILLER_TP_SIZE, PREFILLER_ZE_AFFINITY_MASK, DECODE_HOST, DECODE_PORT,
#   DECODER_TP_SIZE, DECODER_ZE_AFFINITY_MASK (read)
#   PREFILL_BASE_CMD, DECODE_BASE_CMD (written: human-readable renderings)
# Outputs:
#   Both rendered command lines on stdout.
#######################################
launch_pd() {
  local kv_config
  kv_config="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"${KV_BUFFER_DEVICE}\"}"

  local -a prefill_env=(
    "ZE_AFFINITY_MASK=${PREFILLER_ZE_AFFINITY_MASK}"
    "VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200"
    "VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST}"
    "VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT}"
    "VLLM_WORKER_MULTIPROC_METHOD=spawn"
    "VLLM_ENABLE_V1_MULTIPROCESSING=1"
  )
  local -a prefill_args=(
    vllm serve "${MODEL_NAME}"
    --host "${PREFILL_HOST}"
    --port "${PREFILL_PORT}"
    --max-model-len "${MAX_MODEL_LEN}"
    --seed 42
    --block-size "${BLOCK_SIZE}"
    --enforce-eager
    --dtype float16
    -tp "${PREFILLER_TP_SIZE}"
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
  )

  local -a decode_env=(
    "ZE_AFFINITY_MASK=${DECODER_ZE_AFFINITY_MASK}"
    "VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200"
    "VLLM_WORKER_MULTIPROC_METHOD=spawn"
    "VLLM_ENABLE_V1_MULTIPROCESSING=1"
  )
  local -a decode_args=(
    vllm serve "${MODEL_NAME}"
    --host "${DECODE_HOST}"
    --port "${DECODE_PORT}"
    --max-model-len "${MAX_MODEL_LEN}"
    --seed 42
    --block-size "${BLOCK_SIZE}"
    --enforce-eager
    -tp "${DECODER_TP_SIZE}"
    --dtype float16
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
  )

  # Renderings for the log only (JSON shown single-quoted, as it would be typed
  # in a shell); they are no longer what gets executed.
  PREFILL_BASE_CMD="${prefill_env[*]} ${prefill_args[*]} --kv-transfer-config '${kv_config}'"
  DECODE_BASE_CMD="${decode_env[*]} ${decode_args[*]} --kv-transfer-config '${kv_config}'"
  echo "${PREFILL_BASE_CMD}"
  echo "${DECODE_BASE_CMD}"
  sleep 2

  # execute on hosts
  env "${prefill_env[@]}" "${prefill_args[@]}" --kv-transfer-config "${kv_config}" &
  env "${decode_env[@]}" "${decode_args[@]}" --kv-transfer-config "${kv_config}" &

  sleep 1
  wait_for_server "${PREFILL_HOST}" "${PREFILL_PORT}"
  sleep 1
  wait_for_server "${DECODE_HOST}" "${DECODE_PORT}"
  sleep 1
}
#######################################
# Start toy_proxy_server.py in the background, pointing it at the prefill and
# decode instances, then pause 2 seconds.
#
# The command is held as an argv array and run directly rather than being
# concatenated into a string and re-parsed by `bash -c`, so an EXP_ROOT (or
# host value) containing spaces or shell metacharacters is passed literally.
#
# Globals:
#   EXP_ROOT, PREFILL_HOST, PREFILL_PORT, DECODE_HOST, DECODE_PORT,
#   PROXY_HOST, PROXY_PORT (read)
#   PROXY_BASE_CMD (written: human-readable rendering of the command)
# Outputs:
#   The rendered command line on stdout.
#######################################
launch_pd_proxy() {
  local -a proxy_cmd=(
    python3 "${EXP_ROOT}/toy_proxy_server.py"
    --prefiller-host "${PREFILL_HOST}"
    --prefiller-port "${PREFILL_PORT}"
    --decoder-host "${DECODE_HOST}"
    --decoder-port "${DECODE_PORT}"
    "--host=${PROXY_HOST}"
    --port "${PROXY_PORT}"
  )

  # Kept as a global for logging, as before; it is no longer what gets executed.
  PROXY_BASE_CMD="${proxy_cmd[*]}"
  echo "${PROXY_BASE_CMD}"

  "${proxy_cmd[@]}" &
  sleep 2
}
#######################################
# Run test_disagg_accuracy.py against one service endpoint.
# Every expansion is quoted so that a checkout path, model name or output
# file containing spaces is passed as a single argument.
# Globals:
#   EXP_ROOT, MODEL_NAME, OUTPUT_FILE (read)
# Arguments:
#   $1 - service URL passed as --service_url
#   $2 - mode passed as --mode (the script uses "baseline" and "disagg")
# Returns:
#   The exit status of the Python test script.
#######################################
run_tests() {
  local service_url=$1
  local mode=$2

  python3 "${EXP_ROOT}/test_disagg_accuracy.py" \
    --service_url="${service_url}" \
    --model_name="${MODEL_NAME}" \
    --mode="${mode}" \
    --file_name="${OUTPUT_FILE}"
}
# run non-disagg. baseline & save outputs
launch_baseline
run_tests "http://${BASELINE_HOST}:${BASELINE_PORT}" "baseline"
cleanup
sleep 10

# run disagg. & do exact-match with the outputs from baseline
launch_pd
launch_pd_proxy
run_tests "http://${PROXY_HOST}:${PROXY_PORT}" "disagg"
echo "-----P/D success----"

# Quoted, with `--`, so an output path containing spaces or starting with a
# dash is removed as one literal file name.
rm -- "${OUTPUT_FILE}"
cleanup

exit 0
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment