Unverified Commit 0c637391 authored by liuzhenwei's avatar liuzhenwei Committed by GitHub
Browse files

[EPD] update EPD script arguments (#36742)


Signed-off-by: default avatarzhenwei-intel <zhenwei.liu@intel.com>
parent 719735d6
......@@ -26,8 +26,13 @@ MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh
# Use specific storage path
EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
# Run on XPU; scripts switch from CUDA_VISIBLE_DEVICES to ZE_AFFINITY_MASK
DEVICE_PLATFORM=xpu GPU_E=0 GPU_PD=1 bash disagg_1e1pd_example.sh
```
`DEVICE_PLATFORM` defaults to `cuda`. Set `DEVICE_PLATFORM=xpu` when running these examples on Intel GPUs so the scripts use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` for device selection.
## Encoder Instances
Encoder engines should be launched with the following flags:
......
......@@ -19,11 +19,29 @@ GPU_E="${GPU_E:-2}"
GPU_P="${GPU_P:-2}"
GPU_D="${GPU_D:-3}"
# Choose the environment variable that pins each worker to its device.
# DEVICE_PLATFORM is matched case-insensitively and supports: cuda, xpu.
# Any value other than xpu selects CUDA_VISIBLE_DEVICES.
# A non-empty DEVICE_AFFINITY_ENV supplied by the caller is left untouched.
: "${DEVICE_PLATFORM:=cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
  case "${DEVICE_PLATFORM,,}" in
    xpu) DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK" ;;
    *) DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES" ;;
  esac
fi
# Shared storage directory; the script creates it with mkdir -p.
: "${EC_SHARED_STORAGE_PATH:=/tmp/ec_cache}"
: "${TIMEOUT_SECONDS:=12000}" # wait_for_server timeout
: "${NUM_PROMPTS:=100}" # number of prompts to send in benchmark

# Serve args, forwarded to the `vllm serve` workers:
#   GPU_MEMORY_UTILIZATION_E/_P/_D -> --gpu-memory-utilization
#                                     (encoder / prefill / decode worker)
#   MAX_NUM_SEQS                   -> --max-num-seqs
#   MAX_MODEL_LEN                  -> --max-model-len (prefill and decode workers)
: "${GPU_MEMORY_UTILIZATION_E:=0.01}"
: "${GPU_MEMORY_UTILIZATION_P:=0.7}"
: "${GPU_MEMORY_UTILIZATION_D:=0.7}"
: "${MAX_NUM_SEQS:=128}"
: "${MAX_MODEL_LEN:=32768}"
export UCX_TLS=all
export UCX_NET_DEVICES=all
......@@ -92,14 +110,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
env "$DEVICE_AFFINITY_ENV=$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_E" \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--max-num-seqs "$MAX_NUM_SEQS" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
......@@ -115,15 +133,16 @@ PIDS+=($!)
###############################################################################
# Prefill worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_P" \
env "$DEVICE_AFFINITY_ENV=$GPU_P" \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_P" \
--port "$PREFILL_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-model-len "$MAX_MODEL_LEN" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
......@@ -143,15 +162,16 @@ PIDS+=($!)
###############################################################################
# Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_D" \
env "$DEVICE_AFFINITY_ENV=$GPU_D" \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_D" \
--port "$DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-model-len "$MAX_MODEL_LEN" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
......
......@@ -17,11 +17,28 @@ PROXY_PORT="${PROXY_PORT:-10001}"
GPU_E="${GPU_E:-0}"
GPU_PD="${GPU_PD:-1}"
# Choose the environment variable that pins each worker to its device.
# DEVICE_PLATFORM is matched case-insensitively and supports: cuda, xpu.
# Any value other than xpu selects CUDA_VISIBLE_DEVICES.
# A non-empty DEVICE_AFFINITY_ENV supplied by the caller is left untouched.
: "${DEVICE_PLATFORM:=cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
  case "${DEVICE_PLATFORM,,}" in
    xpu) DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK" ;;
    *) DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES" ;;
  esac
fi
# Shared storage directory; the script creates it with mkdir -p.
: "${EC_SHARED_STORAGE_PATH:=/tmp/ec_cache}"
: "${TIMEOUT_SECONDS:=12000}" # wait_for_server timeout
: "${NUM_PROMPTS:=100}" # number of prompts to send in benchmark

# Serve args, forwarded to the `vllm serve` workers:
#   GPU_MEMORY_UTILIZATION_E/_PD -> --gpu-memory-utilization
#                                   (encoder / prefill+decode worker)
#   MAX_NUM_SEQS                 -> --max-num-seqs
#   MAX_MODEL_LEN                -> --max-model-len (prefill+decode worker)
: "${GPU_MEMORY_UTILIZATION_E:=0.01}"
: "${GPU_MEMORY_UTILIZATION_PD:=0.7}"
: "${MAX_NUM_SEQS:=128}"
: "${MAX_MODEL_LEN:=32768}"
###############################################################################
# Helpers
###############################################################################
......@@ -86,14 +103,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH"
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
env "$DEVICE_AFFINITY_ENV=$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_E" \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--max-num-seqs "$MAX_NUM_SEQS" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
......@@ -109,12 +126,13 @@ PIDS+=($!)
###############################################################################
# Prefill+Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
env "$DEVICE_AFFINITY_ENV=$GPU_PD" vllm serve "$MODEL" \
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_PD" \
--port "$PREFILL_DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--max-num-seqs "$MAX_NUM_SEQS" \
--max-model-len "$MAX_MODEL_LEN" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECExampleConnector",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment