#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort on the first command that fails.
set -e

# Command prefix for the final docker invocation; --dry-run sets it to "echo".
RUN_PREFIX=

# Frameworks
#
# Each framework has a corresponding base image.  Additional
# dependencies are specified in the /container/deps folder and
# installed within framework specific sections of the Dockerfile.

declare -A FRAMEWORKS=(["VLLM"]=1 ["TRTLLM"]=2 ["NONE"]=3 ["SGLANG"]=4)

DEFAULT_FRAMEWORK=VLLM

# Directory containing this script (symlinks resolved).
SOURCE_DIR=$(dirname "$(readlink -f "$0")")

# Defaults for the options parsed by get_options.
IMAGE=
TARGET="dev"
HF_CACHE=
DEFAULT_HF_CACHE=${SOURCE_DIR}/.cache/huggingface
GPUS="all"
PRIVILEGED=
VOLUME_MOUNTS=
MOUNT_WORKSPACE=
DEV_MODE=
ENVIRONMENT_VARIABLES=
# Filled by get_options with everything left on the command line after parsing.
REMAINING_ARGS=()
INTERACTIVE=
USE_NIXL_GDS=
RUNTIME=nvidia
WORKDIR=/workspace

#######################################
# Parse the command line and derive the strings used to build `docker run`.
#
# Option parsing stops at `--` or at the first empty/absent argument; what is
# left is stored in REMAINING_ARGS and passed to the container as its command.
#
# Globals read:
#   DEFAULT_FRAMEWORK, FRAMEWORKS
# Globals written (from options):
#   FRAMEWORK, IMAGE, TARGET, NAME, HF_CACHE, GPUS, RUNTIME, ENTRYPOINT,
#   WORKDIR, PRIVILEGED, RM, VOLUME_MOUNTS, ENVIRONMENT_VARIABLES,
#   INTERACTIVE, MOUNT_WORKSPACE, USE_NIXL_GDS, RUN_PREFIX
# Globals written (derived):
#   GPU_STRING, NAME_STRING, ENTRYPOINT_STRING, PRIVILEGED_STRING, RM_STRING,
#   NIXL_GDS_CAPS, DEV_MODE, REMAINING_ARGS
# Arguments:
#   The script's command-line arguments.
# Exits:
#   0 after printing help; 1 (via error/missing_requirement) on an unknown
#   option, an unknown framework, or an option missing its value.
#######################################
get_options() {
    while :; do
        case $1 in
        -h | -\? | --help)
            show_help
            exit
            ;;
        --framework)
            if [ "$2" ]; then
                FRAMEWORK=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --image)
            if [ "$2" ]; then
                IMAGE=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --target)
            if [ "$2" ]; then
                TARGET=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --name)
            if [ "$2" ]; then
                NAME=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --hf-cache)
            if [ "$2" ]; then
                HF_CACHE=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --gpus)
            if [ "$2" ]; then
                GPUS=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --runtime)
            if [ "$2" ]; then
                RUNTIME=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --entrypoint)
            if [ "$2" ]; then
                ENTRYPOINT=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --workdir)
            if [ "$2" ]; then
                WORKDIR="$2"
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --privileged)
            if [ "$2" ]; then
                PRIVILEGED=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        --rm)
            if [ "$2" ]; then
                RM=$2
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -v)
            if [ "$2" ]; then
                VOLUME_MOUNTS+=" -v $2 "
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -e)
            if [ "$2" ]; then
                ENVIRONMENT_VARIABLES+=" -e $2 "
                shift
            else
                missing_requirement "$1"
            fi
            ;;
        -it)
            INTERACTIVE=" -it "
            ;;
        --mount-workspace)
            MOUNT_WORKSPACE=TRUE
            ;;
        --use-nixl-gds)
            USE_NIXL_GDS=TRUE
            ;;
        --dry-run)
            RUN_PREFIX="echo"
            echo ""
            echo "=============================="
            echo "DRY RUN: COMMANDS PRINTED ONLY"
            echo "=============================="
            echo ""
            ;;
        --)
            shift
            break
            ;;
        -?*)
            error 'ERROR: Unknown option: ' "$1"
            ;;
        ?*)
            # Bare positional arguments are only accepted after `--`.
            error 'ERROR: Unknown option: ' "$1"
            ;;
        *)
            # Empty or absent argument: nothing left to parse.
            break
            ;;
        esac

        shift
    done

    if [ -z "$FRAMEWORK" ]; then
        FRAMEWORK=$DEFAULT_FRAMEWORK
    fi

    if [ -n "$FRAMEWORK" ]; then
        # Framework names are matched case-insensitively.
        FRAMEWORK=${FRAMEWORK^^}
        if [[ -z "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
            error 'ERROR: Unknown framework: ' "$FRAMEWORK"
        fi
    fi

    if [ -z "$IMAGE" ]; then
        # Default image tag: dynamo:latest-<framework>[-<target>]
        IMAGE="dynamo:latest-${FRAMEWORK,,}"
        if [ -n "${TARGET}" ]; then
            IMAGE="${IMAGE}-${TARGET}"
        fi
    fi

    # "none" in any letter case disables GPU support: drop both --gpus and
    # --runtime so the two settings cannot disagree.
    if [[ ${GPUS^^} == "NONE" ]]; then
        GPU_STRING=""
        RUNTIME=""
    else
        GPU_STRING="--gpus ${GPUS}"
    fi

    if [[ -z "$NAME" ]]; then
        NAME_STRING=""
    else
        NAME_STRING="--name ${NAME}"
    fi

    if [[ -z "$ENTRYPOINT" ]]; then
        ENTRYPOINT_STRING=""
    else
        ENTRYPOINT_STRING="--entrypoint ${ENTRYPOINT}"
    fi

    if [[ ${HF_CACHE^^} == "NONE" ]]; then
        HF_CACHE=
    fi

    # HF_CACHE mounting will be handled in workspace section

    if [ -z "${PRIVILEGED}" ]; then
        PRIVILEGED="FALSE"
    fi

    if [ -z "${RM}" ]; then
        RM="TRUE"
    fi

    # Initialize PRIVILEGED_STRING
    PRIVILEGED_STRING=""
    if [[ ${PRIVILEGED^^} != "FALSE" ]]; then
        PRIVILEGED_STRING="--privileged"
    fi

    if [[ ${RM^^} == "FALSE" ]]; then
        RM_STRING=""
    else
        RM_STRING=" --rm "
    fi

    # Auto-enable DEV_MODE for vllm dev images
    # TODO(keivenc): Currently only Dockerfile.vllm has proper permissions to run as ubuntu user.
    # Other Dockerfiles (trtllm, sglang, etc.) still require root access.
    #
    # This must run before the GDS section below: it can turn on
    # MOUNT_WORKSPACE, and the GDS section uses MOUNT_WORKSPACE to decide
    # whether /tmp still needs mounting. Doing it afterwards mounted /tmp twice.
    if [[ "$IMAGE" == *"-vllm-dev" ]]; then
        DEV_MODE=TRUE
        MOUNT_WORKSPACE=TRUE
        # Interactive mode is implied when MOUNT_WORKSPACE is TRUE
    fi

    if [ -n "$USE_NIXL_GDS" ]; then
        VOLUME_MOUNTS+=" -v /run/udev:/run/udev:ro "
        NIXL_GDS_CAPS="--cap-add=IPC_LOCK"
        # NOTE(jthomson04): In the KVBM disk pools, we currently allocate our files in /tmp.
        # For some arcane reason, GDS requires that /tmp be mounted.
        # This is already handled for us if we set --mount-workspace
        # If we aren't mounting our workspace but need GDS, we need to mount /tmp.
        if [ -z "$MOUNT_WORKSPACE" ]; then
            VOLUME_MOUNTS+=" -v /tmp:/tmp "
        fi
    else
        NIXL_GDS_CAPS=""
    fi

    REMAINING_ARGS=("$@")
}

#######################################
# Print the usage summary for every option accepted by get_options, then exit.
# Globals read: FRAMEWORKS (its keys are listed as the valid frameworks)
# Outputs:      usage text on stdout
# Exits:        0
#######################################
show_help() {
    echo "usage: run.sh"
    echo "  [--image image]"
    echo "  [--framework framework one of ${!FRAMEWORKS[*]}]"
    echo "  [--target target stage to use, default is 'dev']"
    echo "  [--name name for launched container, default NONE]"
    echo "  [--privileged whether to launch in privileged mode, default FALSE unless mounting workspace]"
    echo "  [--rm whether to remove the container on exit, default TRUE]"
    echo "  [--dry-run print docker commands without running]"
    echo "  [--hf-cache directory to volume mount as the hf cache, default is NONE unless mounting workspace]"
    echo "  [--gpus gpus to enable, default is 'all', 'none' disables gpu support]"
    echo "  [--use-nixl-gds add volume mounts and capabilities needed for NVIDIA GPUDirect Storage]"
    echo "  [-v add volume mount]"
    echo "  [-e add environment variable]"
    echo "  [-it run the container interactively with a TTY]"
    echo "  [--mount-workspace set up for local development]"
    echo "  [-- stop processing and pass remaining args as command to docker run]"
    echo "  [--workdir set the working directory inside the container]"
    echo "  [--runtime add runtime variables]"
    echo "  [--entrypoint override container entrypoint]"
    echo "  [-h, --help show this help]"
    exit 0
}

# Abort because an option that needs a value was given without one.
# Arguments: $1 - the option as typed on the command line (e.g. --image)
# Does not return: delegates to error, which prints to stderr and exits 1.
missing_requirement() {
    local option_name="$1"
    error "ERROR: ${option_name} requires an argument."
}

# Print a two-part message to stderr and terminate the script.
# Arguments: $1 - message prefix
#            $2 - detail appended after a single space (may be omitted)
# Exits:     1
error() {
    local prefix="$1"
    local detail="$2"
    printf '%s\n' "${prefix} ${detail}" >&2
    exit 1
}

# Parse the command line; this fills in the globals consumed by the
# workspace-mount and docker-run sections that follow.
get_options "$@"

# Process workspace mounting after auto-detection
#
# No-op unless MOUNT_WORKSPACE is set. Otherwise:
#   - mounts the parent of SOURCE_DIR at /workspace, plus /tmp and /mnt
#   - forces WORKDIR=/workspace and interactive mode
#   - forwards HF_TOKEN / GITHUB_TOKEN and sets HOME inside the container
#   - mounts HF_CACHE (defaulting to DEFAULT_HF_CACHE) into the container home
#   - in DEV_MODE, appends "--user ubuntu" to PRIVILEGED_STRING
# Globals read:    MOUNT_WORKSPACE, SOURCE_DIR, DEFAULT_HF_CACHE, DEV_MODE,
#                  PRIVILEGED, RUN_PREFIX
# Globals written: HOME_PATH, VOLUME_MOUNTS, WORKDIR, INTERACTIVE, HF_CACHE,
#                  ENVIRONMENT_VARIABLES, PRIVILEGED_STRING
setup_workspace_mounts() {
    if [ -z "$MOUNT_WORKSPACE" ]; then
        return 0
    fi

    HOME_PATH="/home/ubuntu"

    # Common workspace setup
    VOLUME_MOUNTS+=" -v $(dirname "${SOURCE_DIR}"):/workspace "
    VOLUME_MOUNTS+=" -v /tmp:/tmp "
    VOLUME_MOUNTS+=" -v /mnt/:/mnt "
    WORKDIR=/workspace
    INTERACTIVE=" -it "

    # Set default HF_CACHE if not specified
    if [ -z "$HF_CACHE" ]; then
        HF_CACHE=$DEFAULT_HF_CACHE
    fi

    # Environment variables for all workspace modes
    ENVIRONMENT_VARIABLES+=" -e HF_TOKEN"
    ENVIRONMENT_VARIABLES+=" -e GITHUB_TOKEN"
    ENVIRONMENT_VARIABLES+=" -e HOME=$HOME_PATH"

    # Mount HF_CACHE to user's home cache directory
    if [ -n "$HF_CACHE" ]; then
        # Go through RUN_PREFIX so that --dry-run only prints the mkdir
        # instead of creating the directory on the host.
        ${RUN_PREFIX} mkdir -p "$HF_CACHE"
        VOLUME_MOUNTS+=" -v $HF_CACHE:$HOME_PATH/.cache/huggingface"
    fi

    if [ -n "$DEV_MODE" ]; then
        # Dev Container-specific setup - the Dockerfile handles UID/GID mapping via build args
        # This currently only works with Dockerfile.vllm which has proper ubuntu user setup.
        echo "Dev Container mode enabled - using ubuntu user with host UID/GID"
        # Use ubuntu user (with correct UID/GID baked into image)
        PRIVILEGED_STRING+=" --user ubuntu"
    else
        # Standard workspace mode - enable privileged mode
        # TODO(keivenc): Security risk, remove soon. Dockerfiles (trtllm, sglang) still need to run as root.
        # NOTE(review): get_options replaces an empty PRIVILEGED with "FALSE"
        # before this point, so this condition appears never to hold and
        # --privileged is not added here. Confirm the intended default before
        # changing it; behaviour is deliberately left as-is.
        if [ -z "${PRIVILEGED}" ]; then
            PRIVILEGED_STRING="--privileged"
        fi
    fi
}

setup_workspace_mounts

# RUN the image

# Trace the real command; in dry-run mode RUN_PREFIX is "echo", which already
# prints it.
if [ -z "$RUN_PREFIX" ]; then
    set -x
fi

# GPU_STRING, INTERACTIVE, RM_STRING, ENVIRONMENT_VARIABLES, VOLUME_MOUNTS,
# NIXL_GDS_CAPS, PRIVILEGED_STRING, NAME_STRING and ENTRYPOINT_STRING each hold
# zero or more space-separated docker arguments and are intentionally left
# unquoted so they word-split (and vanish when empty).
# shellcheck disable=SC2086
${RUN_PREFIX} docker run \
    ${GPU_STRING} \
    ${INTERACTIVE} \
    ${RM_STRING} \
    --network host \
    ${RUNTIME:+--runtime "$RUNTIME"} \
    --shm-size=10G \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
    --ulimit nofile=65536:65536 \
    ${ENVIRONMENT_VARIABLES} \
    ${VOLUME_MOUNTS} \
    -w "$WORKDIR" \
    --cap-add CAP_SYS_PTRACE \
    ${NIXL_GDS_CAPS} \
    --ipc host \
    ${PRIVILEGED_STRING} \
    ${NAME_STRING} \
    ${ENTRYPOINT_STRING} \
    "${IMAGE}" \
    "${REMAINING_ARGS[@]}"

# Turn tracing back off without the `set +x` itself showing up in the trace.
{ set +x; } 2>/dev/null