Unverified commit 77d702a2, authored by Ev Lacey and committed by GitHub
Browse files

Enhance run_cluster.sh for multi-NIC support (#28328)


Signed-off-by: Ev Lacey <elacey@nvidia.com>
parent 2108a571
...@@ -83,6 +83,29 @@ else ...@@ -83,6 +83,29 @@ else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379" RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
fi fi
# Parse VLLM_HOST_IP from additional args if present and derive Ray's IP
# environment variables from it.
# This is needed for multi-NIC configurations where Ray needs explicit IP
# bindings; these variables ensure Ray binds to the correct network interface.
#######################################
# Scan docker-run style arguments for a VLLM_HOST_IP assignment.
#
# Recognized spellings of the environment flag:
#   -e VLLM_HOST_IP=<ip>     --env VLLM_HOST_IP=<ip>    (value in next arg)
#   -eVLLM_HOST_IP=<ip>      -e=VLLM_HOST_IP=<ip>       (attached to -e)
#   --env=VLLM_HOST_IP=<ip>                             (attached to --env)
# The first assignment found wins; later ones are ignored.
#
# Globals:
#   VLLM_HOST_IP (written) - the extracted value, or "" when none is found
#   RAY_IP_VARS  (written) - array of `-e NAME=value` pairs for docker run;
#                            empty when VLLM_HOST_IP is empty
# Arguments:
#   the extra `docker run` arguments to scan
# Returns:
#   0 always
#######################################
configure_ray_host_ip() {
  local arg
  VLLM_HOST_IP=""
  RAY_IP_VARS=()

  for arg in "$@"; do
    # Peel off an attached env flag so only "NAME=value" is left to match.
    # A bare "-e"/"--env" matches none of these and is simply skipped below;
    # its value arrives as the next argument.
    case "${arg}" in
      --env=*) arg="${arg#--env=}" ;;
      -e=*) arg="${arg#-e=}" ;;
      -e?*) arg="${arg#-e}" ;;
    esac
    if [[ "${arg}" == VLLM_HOST_IP=* ]]; then
      VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
      break
    fi
  done

  # Only emit the Ray overrides when an IP was actually supplied, so the
  # default (single-NIC) launch command is left untouched.
  if [[ -n "${VLLM_HOST_IP}" ]]; then
    RAY_IP_VARS=(
      -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
      -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
    )
  fi
  return 0
}
configure_ray_host_ip "${ADDITIONAL_ARGS[@]}"
# Launch the container with the assembled parameters. # Launch the container with the assembled parameters.
# --network host: Allows Ray nodes to communicate directly via host networking # --network host: Allows Ray nodes to communicate directly via host networking
# --shm-size 10.24g: Increases shared memory # --shm-size 10.24g: Increases shared memory
...@@ -95,5 +118,6 @@ docker run \ ...@@ -95,5 +118,6 @@ docker run \
--shm-size 10.24g \ --shm-size 10.24g \
--gpus all \ --gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
"${RAY_IP_VARS[@]}" \
"${ADDITIONAL_ARGS[@]}" \ "${ADDITIONAL_ARGS[@]}" \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}" "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment