Unverified Commit 0c637391 authored by liuzhenwei, committed by GitHub
Browse files

[EPD] update EPD script arguments (#36742)


Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
parent 719735d6
...@@ -26,8 +26,13 @@ MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh ...@@ -26,8 +26,13 @@ MODEL="Qwen/Qwen2.5-VL-3B-Instruct" bash disagg_1e1p1d_example.sh
# Use specific storage path # Use specific storage path
EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh EC_SHARED_STORAGE_PATH="/tmp/my_ec_cache" bash disagg_1e1p1d_example.sh
# Run on XPU; scripts switch from CUDA_VISIBLE_DEVICES to ZE_AFFINITY_MASK
DEVICE_PLATFORM=xpu GPU_E=0 GPU_PD=1 bash disagg_1e1pd_example.sh
``` ```
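`DEVICE_PLATFORM` defaults to `cuda`. Set `DEVICE_PLATFORM=xpu` when running these examples on Intel GPUs so the scripts use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` for device selection. To select devices through a different variable, set `DEVICE_AFFINITY_ENV` to its name; a non-empty `DEVICE_AFFINITY_ENV` takes precedence over `DEVICE_PLATFORM`.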
## Encoder Instances ## Encoder Instances
Encoder engines should be launched with the following flags: Encoder engines should be launched with the following flags:
......
...@@ -19,11 +19,29 @@ GPU_E="${GPU_E:-2}" ...@@ -19,11 +19,29 @@ GPU_E="${GPU_E:-2}"
GPU_P="${GPU_P:-2}" GPU_P="${GPU_P:-2}"
GPU_D="${GPU_D:-3}" GPU_D="${GPU_D:-3}"
# Device platform and affinity env name.
# DEVICE_PLATFORM supports: cuda, xpu
DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
else
DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
fi
fi
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}" EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
# Serve args
GPU_MEMORY_UTILIZATION_E="${GPU_MEMORY_UTILIZATION_E:-0.01}"
GPU_MEMORY_UTILIZATION_P="${GPU_MEMORY_UTILIZATION_P:-0.7}"
GPU_MEMORY_UTILIZATION_D="${GPU_MEMORY_UTILIZATION_D:-0.7}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
export UCX_TLS=all export UCX_TLS=all
export UCX_NET_DEVICES=all export UCX_NET_DEVICES=all
...@@ -92,14 +110,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH" ...@@ -92,14 +110,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH"
############################################################################### ###############################################################################
# Encoder worker # Encoder worker
############################################################################### ###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ env "$DEVICE_AFFINITY_ENV=$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_E" \
--port "$ENCODE_PORT" \ --port "$ENCODE_PORT" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--max-num-batched-tokens 114688 \ --max-num-batched-tokens 114688 \
--max-num-seqs 128 \ --max-num-seqs "$MAX_NUM_SEQS" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
...@@ -115,15 +133,16 @@ PIDS+=($!) ...@@ -115,15 +133,16 @@ PIDS+=($!)
############################################################################### ###############################################################################
# Prefill worker # Prefill worker
############################################################################### ###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_P" \ env "$DEVICE_AFFINITY_ENV=$GPU_P" \
UCX_NET_DEVICES=all \ UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \ VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_P" \
--port "$PREFILL_PORT" \ --port "$PREFILL_PORT" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--max-num-seqs 128 \ --max-num-seqs "$MAX_NUM_SEQS" \
--max-model-len "$MAX_MODEL_LEN" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
...@@ -143,15 +162,16 @@ PIDS+=($!) ...@@ -143,15 +162,16 @@ PIDS+=($!)
############################################################################### ###############################################################################
# Decode worker # Decode worker
############################################################################### ###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_D" \ env "$DEVICE_AFFINITY_ENV=$GPU_D" \
UCX_NET_DEVICES=all \ UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \ VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
vllm serve "$MODEL" \ vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_D" \
--port "$DECODE_PORT" \ --port "$DECODE_PORT" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--max-num-seqs 128 \ --max-num-seqs "$MAX_NUM_SEQS" \
--max-model-len "$MAX_MODEL_LEN" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--kv-transfer-config '{ --kv-transfer-config '{
"kv_connector": "NixlConnector", "kv_connector": "NixlConnector",
......
...@@ -17,11 +17,28 @@ PROXY_PORT="${PROXY_PORT:-10001}" ...@@ -17,11 +17,28 @@ PROXY_PORT="${PROXY_PORT:-10001}"
GPU_E="${GPU_E:-0}" GPU_E="${GPU_E:-0}"
GPU_PD="${GPU_PD:-1}" GPU_PD="${GPU_PD:-1}"
# Device platform and affinity env name.
# DEVICE_PLATFORM supports: cuda, xpu
DEVICE_PLATFORM="${DEVICE_PLATFORM:-cuda}"
if [[ -z "${DEVICE_AFFINITY_ENV:-}" ]]; then
if [[ "${DEVICE_PLATFORM,,}" == "xpu" ]]; then
DEVICE_AFFINITY_ENV="ZE_AFFINITY_MASK"
else
DEVICE_AFFINITY_ENV="CUDA_VISIBLE_DEVICES"
fi
fi
EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}" EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout
NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
# Serve args
GPU_MEMORY_UTILIZATION_E="${GPU_MEMORY_UTILIZATION_E:-0.01}"
GPU_MEMORY_UTILIZATION_PD="${GPU_MEMORY_UTILIZATION_PD:-0.7}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-128}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
############################################################################### ###############################################################################
# Helpers # Helpers
############################################################################### ###############################################################################
...@@ -86,14 +103,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH" ...@@ -86,14 +103,14 @@ mkdir -p "$EC_SHARED_STORAGE_PATH"
############################################################################### ###############################################################################
# Encoder worker # Encoder worker
############################################################################### ###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ env "$DEVICE_AFFINITY_ENV=$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_E" \
--port "$ENCODE_PORT" \ --port "$ENCODE_PORT" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--no-enable-prefix-caching \ --no-enable-prefix-caching \
--max-num-batched-tokens 114688 \ --max-num-batched-tokens 114688 \
--max-num-seqs 128 \ --max-num-seqs "$MAX_NUM_SEQS" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
...@@ -109,12 +126,13 @@ PIDS+=($!) ...@@ -109,12 +126,13 @@ PIDS+=($!)
############################################################################### ###############################################################################
# Prefill+Decode worker # Prefill+Decode worker
############################################################################### ###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \ env "$DEVICE_AFFINITY_ENV=$GPU_PD" vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION_PD" \
--port "$PREFILL_DECODE_PORT" \ --port "$PREFILL_DECODE_PORT" \
--enforce-eager \ --enforce-eager \
--enable-request-id-headers \ --enable-request-id-headers \
--max-num-seqs 128 \ --max-num-seqs "$MAX_NUM_SEQS" \
--max-model-len "$MAX_MODEL_LEN" \
--allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \ --allowed-local-media-path "${GIT_ROOT}"/tests/v1/ec_connector/integration \
--ec-transfer-config '{ --ec-transfer-config '{
"ec_connector": "ECExampleConnector", "ec_connector": "ECExampleConnector",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment