#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort on the first failing command.
set -e

# Command prefix for the final docker invocation; get_options sets it to
# "echo" for --dry-run so the command is printed instead of executed.
RUN_PREFIX=

# Frameworks
#
# Each framework has a corresponding base image.  Additional
# dependencies are specified in the /container/deps folder and
# installed within framework specific sections of the Dockerfile.

declare -A FRAMEWORKS=(["VLLM"]=1 ["TRTLLM"]=2 ["NONE"]=3 ["SGLANG"]=4)

DEFAULT_FRAMEWORK=VLLM

# Directory containing this script (symlinks resolved).
SOURCE_DIR=$(dirname "$(readlink -f "$0")")

# Defaults for every option get_options can override.
#
# NOTE(review): FRAMEWORK, TARGET, NAME, ENTRYPOINT and RM are not initialised
# here, so values already present in the calling environment are picked up by
# get_options - confirm this is intended.
IMAGE=
HF_HOME=${HF_HOME:-}
DEFAULT_HF_HOME=${SOURCE_DIR}/.cache/huggingface
GPUS="all"
PRIVILEGED=
VOLUME_MOUNTS=
PORT_MAPPINGS=
MOUNT_WORKSPACE=
ENVIRONMENT_VARIABLES=
# Array: get_options stores the arguments following "--" here.
REMAINING_ARGS=()
INTERACTIVE=
USE_NIXL_GDS=
RUNTIME=nvidia
WORKDIR=/workspace
NETWORK=host
# Deliberately reset: only an explicit --user should reach docker, not the
# login name inherited from the environment.
USER=
GROUP_ADD_STRING=

#######################################
# Parse the command line and derive the argument strings for `docker run`.
#
# Globals read:
#   FRAMEWORKS, DEFAULT_FRAMEWORK, DEFAULT_HF_HOME, SOURCE_DIR
# Globals written:
#   RUN_PREFIX, FRAMEWORK, IMAGE, TARGET, NAME, HF_HOME, GPUS, RUNTIME,
#   ENTRYPOINT, WORKDIR, PRIVILEGED, RM, VOLUME_MOUNTS, PORT_MAPPINGS,
#   ENVIRONMENT_VARIABLES, INTERACTIVE, MOUNT_WORKSPACE, USE_NIXL_GDS,
#   NETWORK, USER, USER_UID, HF_HOME_TARGET, GPU_STRING, NAME_STRING,
#   ENTRYPOINT_STRING, PRIVILEGED_STRING, RM_STRING, NIXL_GDS_CAPS,
#   USER_STRING, GROUP_ADD_STRING, REMAINING_ARGS (array)
# Arguments:
#   "$@" - the script's command line; everything after "--" is stored in
#          REMAINING_ARGS.
# Exits:
#   1 (via error) on an unknown option, a missing option value or an unknown
#   framework; --help exits through show_help.
#######################################
get_options() {
    while :; do
        case $1 in
        -h | -\? | --help)
            show_help
            exit
            ;;
        --framework)
            if [ "$2" ]; then
                FRAMEWORK=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --image)
            if [ "$2" ]; then
                IMAGE=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --target)
            if [ "$2" ]; then
                TARGET=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --name)
            if [ "$2" ]; then
                NAME=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --hf-cache|--hf-home)
            if [ "$2" ]; then
                HF_HOME=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --gpus)
            if [ "$2" ]; then
                GPUS=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --runtime)
            if [ "$2" ]; then
                RUNTIME=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --entrypoint)
            if [ "$2" ]; then
                ENTRYPOINT=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --workdir)
            if [ "$2" ]; then
                WORKDIR="$2"
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --privileged)
            if [ "$2" ]; then
                PRIVILEGED=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --rm)
            if [ "$2" ]; then
                RM=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -v)
            if [ "$2" ]; then
                VOLUME_MOUNTS+=" -v $2 "
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -p|--port)
            if [ "$2" ]; then
                PORT_MAPPINGS+=" -p $2 "
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -e)
            if [ "$2" ]; then
                ENVIRONMENT_VARIABLES+=" -e $2 "
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -it)
            INTERACTIVE=" -it "
            ;;
        --mount-workspace)
            MOUNT_WORKSPACE=TRUE
            ;;
        --use-nixl-gds)
            USE_NIXL_GDS=TRUE
            ;;
        --network)
            if [ "$2" ]; then
                NETWORK=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --user)
            if [ "$2" ]; then
                USER=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --dry-run)
            RUN_PREFIX="echo"
            echo ""
            echo "=============================="
            echo "DRY RUN: COMMANDS PRINTED ONLY"
            echo "=============================="
            echo ""
            ;;
        --)
            shift
            break
            ;;
        -?*)
            error 'ERROR: Unknown option: ' "$1"
            ;;
        ?*)
            error 'ERROR: Unknown option: ' "$1"
            ;;
        *)
            # Empty/absent argument: nothing left to parse.
            break
            ;;
        esac

        shift
    done

    if [ -z "$FRAMEWORK" ]; then
        FRAMEWORK=$DEFAULT_FRAMEWORK
    fi

    if [ -n "$FRAMEWORK" ]; then
        # Framework names are matched case-insensitively.
        FRAMEWORK=${FRAMEWORK^^}
        if [[ -z "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
            error 'ERROR: Unknown framework: ' "$FRAMEWORK"
        fi
    fi

    if [ -z "$IMAGE" ]; then
        IMAGE="dynamo:latest-${FRAMEWORK,,}"
        if [ -n "${TARGET}" ]; then
            IMAGE="${IMAGE}-${TARGET}"
        fi
    fi

    # "--gpus none" (any letter case) disables both the --gpus flag and the
    # --runtime flag, so the two never disagree.
    if [[ ${GPUS^^} == "NONE" ]]; then
        GPU_STRING=""
        RUNTIME=""
    else
        GPU_STRING="--gpus ${GPUS}"
    fi

    if [[ ${NAME^^} == "" ]]; then
        NAME_STRING=""
    else
        NAME_STRING="--name ${NAME}"
    fi

    if [[ ${ENTRYPOINT^^} == "" ]]; then
        ENTRYPOINT_STRING=""
    else
        ENTRYPOINT_STRING="--entrypoint ${ENTRYPOINT}"
    fi

    if [ -n "$MOUNT_WORKSPACE" ]; then
        VOLUME_MOUNTS+=" -v ${SOURCE_DIR}/..:/workspace "
        VOLUME_MOUNTS+=" -v /tmp:/tmp "
        VOLUME_MOUNTS+=" -v /mnt/:/mnt "

        if [ -z "$HF_HOME" ]; then
            HF_HOME=$DEFAULT_HF_HOME
        fi

        if [ -z "${PRIVILEGED}" ]; then
            PRIVILEGED="TRUE"
        fi

        ENVIRONMENT_VARIABLES+=" -e HF_TOKEN"
    fi

    if [[ ${HF_HOME^^} == "NONE" ]]; then
        HF_HOME=
    fi

    # User part of "--user <user>[:<group>]" (text before the first colon);
    # empty when --user was not given.
    USER_UID="${USER%%:*}"

    if [ -n "$HF_HOME" ]; then
        # A dry run must not touch the filesystem.
        if [ -z "$RUN_PREFIX" ]; then
            mkdir -p "$HF_HOME"
        fi
        # Decide on the user part only, so "root:root" / "0:0" are treated
        # like "root" / "0".
        if [[ ${USER_UID} == "root" ]] || [[ ${USER_UID} == "0" ]]; then
            HF_HOME_TARGET="/root/.cache/huggingface"
        else
            HF_HOME_TARGET="/home/dynamo/.cache/huggingface"
        fi
        VOLUME_MOUNTS+=" -v $HF_HOME:$HF_HOME_TARGET"
    fi

    if [ -z "${PRIVILEGED}" ]; then
        PRIVILEGED="FALSE"
    fi

    if [ -z "${RM}" ]; then
        RM="TRUE"
    fi

    # Any value other than FALSE (case-insensitive) enables the flag.
    if [[ ${PRIVILEGED^^} == "FALSE" ]]; then
        PRIVILEGED_STRING=""
    else
        PRIVILEGED_STRING="--privileged"
    fi

    if [[ ${RM^^} == "FALSE" ]]; then
        RM_STRING=""
    else
        RM_STRING=" --rm "
    fi

    if [ -n "$USE_NIXL_GDS" ]; then
        VOLUME_MOUNTS+=" -v /run/udev:/run/udev:ro "
        NIXL_GDS_CAPS="--cap-add=IPC_LOCK"
        # NOTE(jthomson04): In the KVBM disk pools, we currently allocate our files in /tmp.
        # For some arcane reason, GDS requires that /tmp be mounted.
        # This is already handled for us if we set --mount-workspace
        # If we aren't mounting our workspace but need GDS, we need to mount /tmp.
        if [ -z "$MOUNT_WORKSPACE" ]; then
            VOLUME_MOUNTS+=" -v /tmp:/tmp "
        fi
    else
        NIXL_GDS_CAPS=""
    fi

    if [[ ${USER} == "" ]]; then
        USER_STRING=""
    else
        USER_STRING="--user ${USER}"
    fi

    # If we override the user, Docker drops supplementary groups from the image.
    # Add root group (GID 0) back so group-writable directories owned by root remain writable,
    # avoiding expensive `chown -R ...` fixes on large mounted workspaces.
    GROUP_ADD_STRING=""
    if [[ -n "${USER}" ]]; then
        if [[ "${USER_UID}" != "root" && "${USER_UID}" != "0" ]]; then
            GROUP_ADD_STRING="--group-add 0"
        fi
    fi

    # Whatever follows "--" is passed through to the container as its command.
    REMAINING_ARGS=("$@")
}

#######################################
# Print the usage summary for every option get_options accepts, then exit.
# Globals:  FRAMEWORKS (read) - its keys are listed as the valid frameworks.
# Outputs:  usage text on stdout.
# Exits:    0 (never returns to the caller).
#######################################
show_help() {
    cat <<EOF
usage: run.sh
  [--image image]
  [--framework framework one of ${!FRAMEWORKS[*]}]
  [--target target suffix appended to the default image tag]
  [--name name for launched container, default NONE]
  [--privileged whether to launch in privileged mode, default FALSE unless mounting workspace]
  [--rm whether to remove the container on exit, default TRUE]
  [--dry-run print docker commands without running]
  [--hf-home|--hf-cache directory to volume mount as the hf home, default is NONE unless mounting workspace]
  [--gpus gpus to enable, default is 'all', 'none' disables gpu support]
  [--use-nixl-gds add volume mounts and capabilities needed for NVIDIA GPUDirect Storage]
  [--network network mode for container, default is 'host']
           Options: 'host' (default), 'bridge', 'none', 'container:name'
           Examples: --network bridge (isolated), --network none (no network - WARNING: breaks most functionality)
                    --network container:redis (share network with 'redis' container)
  [--user <name|uid>[:<group|gid>] specify user to run container as]
           Format: username or numeric UID, optionally with group/GID (e.g., 'root', '0', '1000:0')
  [-v add volume mount]
  [-p|--port add port mapping (host_port:container_port)]
  [-e add environment variable]
  [-it run the container interactively with a TTY]
  [--mount-workspace set up for local development]
  [-- stop processing and pass remaining args as command to docker run]
  [--workdir set the working directory inside the container]
  [--runtime container runtime passed to docker run, default is 'nvidia']
  [--entrypoint override container entrypoint]
  [-h, --help show this help]
EOF
    exit 0
}

# Abort (through error) because the option named in $1 was given without
# its mandatory value.
missing_requirement() {
    local option_name=$1
    error "ERROR: ${option_name} requires an argument."
}

# Write "$1 $2" (joined by a single space) to stderr and terminate the
# script with exit status 1.
error() {
    local lead=$1
    local detail=$2
    {
        printf '%s %s\n' "${lead}" "${detail}"
    } 1>&2
    exit 1
}

get_options "$@"

# RUN the image
#
# Trace the real invocation so the exact docker command appears in the log;
# in dry-run mode RUN_PREFIX ("echo") prints it instead.
if [ -z "$RUN_PREFIX" ]; then
    set -x
fi

# The *_STRING / VOLUME_MOUNTS / PORT_MAPPINGS / ENVIRONMENT_VARIABLES values
# are intentionally unquoted: each holds zero or more whitespace-separated
# docker arguments and relies on word splitting.
${RUN_PREFIX} docker run \
    ${GPU_STRING} \
    ${INTERACTIVE} \
    ${RM_STRING} \
    --network "$NETWORK" \
    ${RUNTIME:+--runtime "$RUNTIME"} \
    --shm-size=10G \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
    --ulimit nofile=65536:65536 \
    ${ENVIRONMENT_VARIABLES} \
    ${VOLUME_MOUNTS} \
    ${PORT_MAPPINGS} \
    -w "$WORKDIR" \
    --cap-add CAP_SYS_PTRACE \
    ${NIXL_GDS_CAPS} \
    --ipc host \
    ${PRIVILEGED_STRING} \
    ${USER_STRING} \
    ${GROUP_ADD_STRING} \
    ${NAME_STRING} \
    ${ENTRYPOINT_STRING} \
    "${IMAGE}" \
    "${REMAINING_ARGS[@]}"

# Turn tracing off again without the "set +x" line itself being traced.
{ set +x; } 2>/dev/null