[EPD] update EPD script arguments (#36742)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>

[EPD] update EPD script arguments (#36742)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
0c637391 · liuzhenwei · GitHub · 719735d6 · 0c637391 · 0c637391
Unverified Commit 0c637391 authored Mar 31, 2026 by liuzhenwei Committed by GitHub Mar 31, 2026
3 changed files
--- a/examples/online_serving/disaggregated_encoder/README.md
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -26,8 +26,13 @@ MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh

 # Use specific storage path
 EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
+
+# Run on XPU; scripts switch from CUDA_VISIBLE_DEVICES to ZE_AFFINITY_MASK
+DEVICE_PLATFORM=xpu GPU_E=0 GPU_PD=1 bash disagg_1e1pd_example.sh
 ```

+`DEVICE_PLATFORM` defaults to `cuda`. Set `DEVICE_PLATFORM=xpu` when running these examples on Intel GPUs so the scripts use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` for device selection.
+
 ## Encoder Instances

 Encoder engines should be launched with the following flags:

--- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
@@ -19,11 +19,29 @@ GPU_E="${GPU_E:-2}"
 GPU_P="${GPU_P:-2}"
 GPU_D="${GPU_D:-3}"

+# Device platform and affinity env name.
+# DEVICE_PLATFORM supports: cuda, xpu
+DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
+if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
+    if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
+        DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
+    else
+        DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
+    fi
+fi
+
 EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
 TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}"   # wait_for_server timeout

 NUM_PROMPTS="${NUM_PROMPTS:-100}"    # number of prompts to send in benchmark

+# Serve args
+GPU_MEMORY_UTILIZATION_E="${GPU_MEMORY_UTILIZATION_E:-0.01}"
+GPU_MEMORY_UTILIZATION_P="${GPU_MEMORY_UTILIZATION_P:-0.7}"
+GPU_MEMORY_UTILIZATION_D="${GPU_MEMORY_UTILIZATION_D:-0.7}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+
 export UCX_TLS=all
 export UCX_NET_DEVICES=all

@@ -92,14 +110,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH"
 ###############################################################################
 # Encoder worker
 ###############################################################################
-CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
-    --gpu-memory-utilization 0.01 \
+env "$DEVICE_AFFINITY_ENV=$GPU_E" vllm serve "$MODEL" \
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_E" \
    --port "$ENCODE_PORT" \
    --enforce-eager \
    --enable-request-id-headers \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 114688 \
-    --max-num-seqs 128 \
+    --max-num-seqs "$MAX_NUM_SEQS" \
    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
@@ -115,15 +133,16 @@ PIDS+=($!)
 ###############################################################################
 # Prefill worker
 ###############################################################################
-CUDA_VISIBLE_DEVICES="$GPU_P" \
+env "$DEVICE_AFFINITY_ENV=$GPU_P" \
 UCX_NET_DEVICES=all \
 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
 vllm serve "$MODEL" \
-    --gpu-memory-utilization 0.7 \
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_P" \
    --port "$PREFILL_PORT" \
    --enforce-eager \
    --enable-request-id-headers \
-    --max-num-seqs 128 \
+    --max-num-seqs "$MAX_NUM_SEQS" \
+    --max-model-len "$MAX_MODEL_LEN" \
    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
@@ -143,15 +162,16 @@ PIDS+=($!)
 ###############################################################################
 # Decode worker
 ###############################################################################
-CUDA_VISIBLE_DEVICES="$GPU_D" \
+env "$DEVICE_AFFINITY_ENV=$GPU_D" \
 UCX_NET_DEVICES=all \
 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
 vllm serve "$MODEL" \
-    --gpu-memory-utilization 0.7 \
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_D" \
    --port "$DECODE_PORT" \
    --enforce-eager \
    --enable-request-id-headers \
-    --max-num-seqs 128 \
+    --max-num-seqs "$MAX_NUM_SEQS" \
+    --max-model-len "$MAX_MODEL_LEN" \
    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --kv-transfer-config '{
        "kv_connector": "NixlConnector",

--- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
+++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
@@ -17,11 +17,28 @@ PROXY_PORT="${PROXY_PORT:-10001}"
 GPU_E="${GPU_E:-0}"
 GPU_PD="${GPU_PD:-1}"

+# Device platform and affinity env name.
+# DEVICE_PLATFORM supports: cuda, xpu
+DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
+if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
+    if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
+        DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
+    else
+        DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
+    fi
+fi
+
 EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
 TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}"   # wait_for_server timeout

 NUM_PROMPTS="${NUM_PROMPTS:-100}"    # number of prompts to send in benchmark

+# Serve args
+GPU_MEMORY_UTILIZATION_E="${GPU_MEMORY_UTILIZATION_E:-0.01}"
+GPU_MEMORY_UTILIZATION_PD="${GPU_MEMORY_UTILIZATION_PD:-0.7}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+
 ###############################################################################
 # Helpers
 ###############################################################################
@@ -86,14 +103,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH"
 ###############################################################################
 # Encoder worker
 ###############################################################################
-CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
-    --gpu-memory-utilization 0.01 \
+env "$DEVICE_AFFINITY_ENV=$GPU_E" vllm serve "$MODEL" \
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_E" \
    --port "$ENCODE_PORT" \
    --enforce-eager \
    --enable-request-id-headers \
    --no-enable-prefix-caching \
    --max-num-batched-tokens 114688 \
-    --max-num-seqs 128 \
+    --max-num-seqs "$MAX_NUM_SEQS" \
    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",
@@ -109,12 +126,13 @@ PIDS+=($!)
 ###############################################################################
 # Prefill+Decode worker
 ###############################################################################
-CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
-    --gpu-memory-utilization 0.7 \
+env "$DEVICE_AFFINITY_ENV=$GPU_PD" vllm serve "$MODEL" \
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_PD" \
    --port "$PREFILL_DECODE_PORT" \
    --enforce-eager \
    --enable-request-id-headers \
-    --max-num-seqs 128 \
+    --max-num-seqs "$MAX_NUM_SEQS" \
+    --max-model-len "$MAX_MODEL_LEN" \
    --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
    --ec-transfer-config '{
        "ec_connector": "ECExampleConnector",