"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "57bdbdf1eec917ca74a3b76bc15c90cc9a2cbc9e"
Unverified Commit ae7e08a3 authored by Ryan McCormick's avatar Ryan McCormick Committed by GitHub
Browse files

fix: Fix NATS_SERVER value, add details on customizing MOUNTS (#1520)

parent 75503dae
...@@ -68,6 +68,25 @@ inside an interactive shell on one of the allocated nodes: ...@@ -68,6 +68,25 @@ inside an interactive shell on one of the allocated nodes:
# https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker # https://github.com/ai-dynamo/dynamo/tree/main/examples/tensorrt_llm#build-docker
export IMAGE="<dynamo_trtllm_image>" export IMAGE="<dynamo_trtllm_image>"
# MOUNTS are the host:container path pairs that are mounted into the containers
# launched by each `srun` command.
#
# If you want to reference files, such as $MODEL_PATH below, in a
# different location, you can customize MOUNTS or specify additional
# comma-separated mount pairs here.
#
# NOTE: Currently, this example assumes that the local bash scripts and configs
# referenced are mounted into /mnt inside the container. If you want to
# customize the location of the scripts, make sure to modify `srun_script.sh`
# accordingly for the new locations of `start_frontend_services.sh` and
# `start_trtllm_worker.sh`.
#
# For example, assuming your cluster had a `/lustre` directory on the host, you
# could add that as a mount like so:
#
# export MOUNTS="${PWD}:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}:/mnt"
# NOTE: In general, Deepseek R1 is very large, so it is recommended to # NOTE: In general, Deepseek R1 is very large, so it is recommended to
# pre-download the model weights and save them in some shared location, # pre-download the model weights and save them in some shared location,
# NFS storage, HF_CACHE, etc. and modify the `--model-path` below # NFS storage, HF_CACHE, etc. and modify the `--model-path` below
......
...@@ -10,7 +10,8 @@ IMAGE="${IMAGE:-""}" ...@@ -10,7 +10,8 @@ IMAGE="${IMAGE:-""}"
# but you may freely customize the mounts based on your cluster. A common practice # but you may freely customize the mounts based on your cluster. A common practice
# is to mount paths to NFS storage for common scripts, model weights, etc. # is to mount paths to NFS storage for common scripts, model weights, etc.
# NOTE: This can be a comma separated list of multiple mounts as well. # NOTE: This can be a comma separated list of multiple mounts as well.
MOUNTS="$PWD:/mnt" DEFAULT_MOUNT="${PWD}:/mnt"
MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
# Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes. # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
# For 8xH100 nodes, as an example, you may set this to 2 nodes (16 GPUs total) or 4 nodes (32 GPUs total) instead. # For 8xH100 nodes, as an example, you may set this to 2 nodes (16 GPUs total) or 4 nodes (32 GPUs total) instead.
...@@ -23,7 +24,7 @@ ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)" ...@@ -23,7 +24,7 @@ ACCOUNT="$(sacctmgr -nP show assoc where user=$(whoami) format=account)"
export HEAD_NODE="${SLURMD_NODENAME}" export HEAD_NODE="${SLURMD_NODENAME}"
export HEAD_NODE_IP="$(hostname -i)" export HEAD_NODE_IP="$(hostname -i)"
export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379" export ETCD_ENDPOINTS="${HEAD_NODE_IP}:2379"
export NATS_SERVER="${HEAD_NODE_IP}:4222" export NATS_SERVER="nats://${HEAD_NODE_IP}:4222"
if [[ -z ${IMAGE} ]]; then if [[ -z ${IMAGE} ]]; then
echo "ERROR: You need to set the IMAGE environment variable to the " \ echo "ERROR: You need to set the IMAGE environment variable to the " \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment