"vllm/vscode:/vscode.git/clone" did not exist on "6b2427f995a81377e5758b34c8bb4c66db2f67bc"
Unverified Commit d527cf0b authored by Ev Lacey's avatar Ev Lacey Committed by GitHub
Browse files

[FIX]Patch run-cluster.sh (fix for #28328) (#30002)


Signed-off-by: elacey <elacey@nvidia.com>
Signed-off-by: Ev Lacey <github@everettlacey.com>
parent 2cc5affc
...@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then ...@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
exit 1 exit 1
fi fi
#######################################
# Extract the value of VLLM_HOST_IP from a list of docker-run style arguments.
# Recognised spellings (the first match wins):
#   -e VLLM_HOST_IP=<ip>      --env VLLM_HOST_IP=<ip>
#   -eVLLM_HOST_IP=<ip>       --env=VLLM_HOST_IP=<ip>
#   VLLM_HOST_IP=<ip>         (bare word)
# Arguments: the argument list to scan (may be empty)
# Outputs:   writes the value to stdout; writes nothing when no match is found
# Returns:   0 always
#######################################
extract_vllm_host_ip() {
  local -a args=("$@")
  local i arg next
  for ((i = 0; i < ${#args[@]}; i++)); do
    arg="${args[i]}"
    case "${arg}" in
      -e | --env)
        # The variable assignment is the following word; ":-" keeps this safe
        # when the flag is the last argument.
        next="${args[i + 1]:-}"
        if [[ "${next}" == VLLM_HOST_IP=* ]]; then
          printf '%s\n' "${next#VLLM_HOST_IP=}"
          return 0
        fi
        ;;
      -eVLLM_HOST_IP=* | --env=VLLM_HOST_IP=* | VLLM_HOST_IP=*)
        # Strip through the variable name rather than the first "=", so the
        # "--env=" spelling does not leave "VLLM_HOST_IP=" in the value.
        printf '%s\n' "${arg#*VLLM_HOST_IP=}"
        return 0
        ;;
    esac
  done
  return 0
}

# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=...").
# The ${arr[@]+...} form expands to nothing for an empty or unset array, which
# avoids an "unbound variable" error on older bash when "set -u" is active.
VLLM_HOST_IP="$(extract_vllm_host_ip ${ADDITIONAL_ARGS[@]+"${ADDITIONAL_ARGS[@]}"})"

# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
# When they differ, VLLM_HOST_IP takes precedence and a warning is emitted on
# stderr.
if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then
  if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
    echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})." >&2
    echo "Using VLLM_HOST_IP as the head node address." >&2
    HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
  fi
fi
# Generate a unique container name with random suffix. # Generate a unique container name with random suffix.
# Docker container names must be unique on each host. # Docker container names must be unique on each host.
# The random suffix allows multiple Ray containers to run simultaneously on the same machine, # The random suffix allows multiple Ray containers to run simultaneously on the same machine,
...@@ -78,32 +106,13 @@ trap cleanup EXIT ...@@ -78,32 +106,13 @@ trap cleanup EXIT
# while workers connect to the head's address. # while workers connect to the head's address.
RAY_START_CMD="ray start --block" RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then if [ "${NODE_TYPE}" == "--head" ]; then
RAY_START_CMD+=" --head --port=6379" RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
else else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
fi
# Parse VLLM_HOST_IP from additional args if present. RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings. if [ -n "${VLLM_HOST_IP}" ]; then
VLLM_HOST_IP="" RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
for arg in "${ADDITIONAL_ARGS[@]}"; do
if [[ $arg == "-e" ]]; then
continue
fi
if [[ $arg == VLLM_HOST_IP=* ]]; then
VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
break
fi fi
done
# Build Ray IP environment variables if VLLM_HOST_IP is set.
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
RAY_IP_VARS=()
if [ -n "${VLLM_HOST_IP}" ]; then
RAY_IP_VARS=(
-e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
-e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
)
fi fi
# Launch the container with the assembled parameters. # Launch the container with the assembled parameters.
...@@ -118,6 +127,5 @@ docker run \ ...@@ -118,6 +127,5 @@ docker run \
--shm-size 10.24g \ --shm-size 10.24g \
--gpus all \ --gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${RAY_IP_VARS[@]}" \
"${ADDITIONAL_ARGS[@]}" \ "${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}" "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment