gpu_utils.sh 10.3 KB
Newer Older
1
#!/bin/bash
2
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
4
# SPDX-License-Identifier: Apache-2.0
#
5
# Shared GPU utility functions for launch scripts (source, don't execute).
6
#
7
# Usage:
8
9
10
11
#   source "$(dirname "$(readlink -f "$0")")/../common/gpu_utils.sh"
#   # or with SCRIPT_DIR already set:
#   source "$SCRIPT_DIR/../common/gpu_utils.sh"
#
12
13
14
15
# Functions (all return via stdout):
#   build_gpu_mem_args <engine> [--workers-per-gpu N]
#       Returns engine-specific CLI args for GPU memory control based on
#       environment variable overrides. Empty if no overrides.
16
#
17
18
19
#       Supported engines: vllm, sglang
#
#       vLLM:   _PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES      → --kv-cache-memory-bytes N --gpu-memory-utilization 0.01
20
#       SGLang: _PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS → --max-total-tokens N
21
#
22
23
24
25
#       Note: TensorRT-LLM uses build_trtllm_override_args_with_mem() instead (requires JSON merging)
#
#       TODO: Split into build_vllm_gpu_mem_args and build_sglang_gpu_mem_args
#
26
# Usage:
27
#   # vLLM / SGLang
28
29
#   GPU_MEM_ARGS=$(build_gpu_mem_args sglang)
#   python -m dynamo.sglang --model-path "$MODEL" $GPU_MEM_ARGS &
30
#
31
32
#   GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
#   python -m dynamo.vllm --model "$MODEL" $GPU_MEM_ARGS &
33
build_gpu_mem_args() {
34
    local engine="${1:?usage: build_gpu_mem_args <engine> [--workers-per-gpu N]}"
35
36
    shift

37
38
39
40
41
42
    # TensorRT-LLM uses build_trtllm_override_args_with_mem instead
    if [[ "$engine" == "trtllm" ]]; then
        echo "build_gpu_mem_args: TensorRT-LLM not supported. Use build_trtllm_override_args_with_mem instead." >&2
        return 1
    fi

43
44
45
    local workers_per_gpu=1
    while [[ $# -gt 0 ]]; do
        case "$1" in
46
            --workers-per-gpu) workers_per_gpu="$2"; shift 2 ;;
47
48
49
50
            *) echo "build_gpu_mem_args: unknown option '$1'" >&2; return 1 ;;
        esac
    done

51
52
53
54
    # --- SGLang: token-based KV cache cap ---
    if [[ "$engine" == "sglang" && -n "${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS:-}" ]]; then
        echo "--max-total-tokens ${_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS}"
        return 0
55
56
    fi

57
58
59
60
61
62
63
64
65
66
67
    # --- vLLM: byte-based KV cache cap ---
    # --gpu-memory-utilization 0.01 prevents vLLM's startup check from rejecting
    # the launch when co-resident tests use >10% of VRAM (vLLM checks free memory
    # against the fraction *before* applying the byte cap).
    if [[ "$engine" == "vllm" && -n "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}" ]]; then
        local kv_bytes="$_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"
        if [[ "$workers_per_gpu" -gt 1 ]]; then
            kv_bytes=$(awk -v b="$kv_bytes" -v n="$workers_per_gpu" 'BEGIN { printf "%d", b / n }')
        fi
        echo "--kv-cache-memory-bytes $kv_bytes --gpu-memory-utilization 0.01"
        return 0
68
69
    fi

70
71
    # No override — engine uses its default allocation
    echo ""
72
73
74
}


75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# ---------------------------------------------------------------------------
# build_trtllm_override_args_with_mem [--merge-with-json JSON]
#   TensorRT-LLM-specific: builds JSON for --override-engine-args with GPU memory config.
#   Returns ONLY the bare JSON value (no --override-engine-args flag, no quotes).
#
#   Separate function because TRT-LLM requires JSON merging for --override-engine-args
#   (unlike vLLM/SGLang which use direct CLI flags).
#
#   Environment variables:
#     _PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS        → {"kv_cache_config": {"max_tokens": N}}
#     _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES → {"kv_cache_config": {"max_gpu_total_bytes": N}}
#
#   If --merge-with-json is provided, merges GPU config with the existing JSON.
#
# Usage:
#   # TensorRT-LLM: simple case (no existing overrides)
#   JSON=$(build_trtllm_override_args_with_mem)
#   python -m dynamo.trtllm --model-path "$MODEL" ${JSON:+--override-engine-args "$JSON"} &
#
#   # TensorRT-LLM: merge with existing JSON
#   EXISTING='{"return_perf_metrics": true}'
#   JSON=$(build_trtllm_override_args_with_mem --merge-with-json "$EXISTING")
#   python -m dynamo.trtllm --model-path "$MODEL" --override-engine-args "$JSON" &
# ---------------------------------------------------------------------------
build_trtllm_override_args_with_mem() {
    local merge_json=""
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --merge-with-json)
                merge_json="$2"
                shift 2
                ;;
            *) echo "build_trtllm_override_args_with_mem: unknown option '$1'" >&2; return 1 ;;
        esac
    done

    local gpu_mem_json=""

    # Token-based (preferred, simpler to reason about)
    if [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS:-}" ]]; then
        gpu_mem_json='"kv_cache_config": {"max_tokens": '"${_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS}"'}'
    # Byte-based (alternative, more precise)
    elif [[ -n "${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES:-}" ]]; then
        gpu_mem_json='"kv_cache_config": {"max_gpu_total_bytes": '"${_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES}"'}'
    fi

    if [[ -n "$gpu_mem_json" ]]; then
        if [[ -n "$merge_json" ]]; then
            # Merge: GPU mem config first, then existing config
            # Strip outer braces from existing JSON
            local existing="${merge_json#\{}"
            existing="${existing%\}}"
            if [[ -n "${existing//[[:space:]]/}" ]]; then
                echo "{${gpu_mem_json}, ${existing}}"
            else
                echo "{${gpu_mem_json}}"
            fi
        else
            # Just GPU mem config
            echo "{${gpu_mem_json}}"
        fi
    elif [[ -n "$merge_json" ]]; then
        # No GPU override, return existing JSON as-is
        echo "$merge_json"
    fi

    # No output if both are empty (engine uses default)
}


145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# ---------------------------------------------------------------------------
# Self-test: bash gpu_utils.sh --self-test
# ---------------------------------------------------------------------------
_gpu_utils_self_test() {
    local pass=0 fail=0
    _assert() {
        local label="$1" expected="$2" actual="$3"
        if [[ "$expected" == "$actual" ]]; then
            ((pass++))
            echo "  PASS  $label"
        else
            ((fail++))
            echo "  FAIL  $label  (expected='$expected'  actual='$actual')"
        fi
    }

161
    local result
162

163
164
165
166
    echo "=== vLLM: kv bytes override ==="
    result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
        build_gpu_mem_args vllm)
    _assert "kv bytes" "--kv-cache-memory-bytes 942054000 --gpu-memory-utilization 0.01" "$result"
167
168

    echo ""
169
170
171
172
    echo "=== vLLM: kv bytes with --workers-per-gpu 2 ==="
    result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
        build_gpu_mem_args vllm --workers-per-gpu 2)
    _assert "kv bytes / 2" "--kv-cache-memory-bytes 471027000 --gpu-memory-utilization 0.01" "$result"
173
174

    echo ""
175
176
177
    echo "=== vLLM: no override = empty ==="
    result=$(build_gpu_mem_args vllm)
    _assert "empty (engine default)" "" "$result"
178
179

    echo ""
180
181
182
183
    echo "=== vLLM: sglang token env ignored ==="
    result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=23824 \
        build_gpu_mem_args vllm)
    _assert "vllm ignores token cap" "" "$result"
184
185

    echo ""
186
187
188
189
    echo "=== sglang: token cap env ==="
    result=$(_PROFILE_OVERRIDE_SGLANG_MAX_TOTAL_TOKENS=1024 \
        build_gpu_mem_args sglang)
    _assert "token cap" "--max-total-tokens 1024" "$result"
190
191

    echo ""
192
193
194
    echo "=== sglang: no override = empty ==="
    result=$(build_gpu_mem_args sglang)
    _assert "empty (engine default)" "" "$result"
195
196

    echo ""
197
198
199
200
    echo "=== sglang: vllm kv bytes env ignored ==="
    result=$(_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES=942054000 \
        build_gpu_mem_args sglang)
    _assert "sglang ignores kv bytes" "" "$result"
201

202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
    echo ""
    echo "=== trtllm: token cap env ==="
    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=4096 \
        build_trtllm_override_args_with_mem)
    _assert "trtllm token cap" '{"kv_cache_config": {"max_tokens": 4096}}' "$result"

    echo ""
    echo "=== trtllm: byte cap env ==="
    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=1073741824 \
        build_trtllm_override_args_with_mem)
    _assert "trtllm byte cap" '{"kv_cache_config": {"max_gpu_total_bytes": 1073741824}}' "$result"

    echo ""
    echo "=== trtllm: no override = empty ==="
    result=$(build_trtllm_override_args_with_mem)
    _assert "empty (engine default)" "" "$result"

    echo ""
    echo "=== trtllm: token cap takes precedence over byte cap ==="
    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 _PROFILE_OVERRIDE_TRTLLM_MAX_GPU_TOTAL_BYTES=999999 \
        build_trtllm_override_args_with_mem)
    _assert "trtllm token precedence" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"

    echo ""
    echo "=== trtllm: merge with existing JSON ==="
    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
        build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}')
    _assert "trtllm merged" '{"kv_cache_config": {"max_tokens": 2048}, "return_perf_metrics": true, "otlp_traces_endpoint": "http://localhost:4317"}' "$result"

    echo ""
    echo "=== trtllm: merge with empty JSON object ==="
    result=$(_PROFILE_OVERRIDE_TRTLLM_MAX_TOTAL_TOKENS=2048 \
        build_trtllm_override_args_with_mem --merge-with-json '{}')
    _assert "trtllm merge empty obj" '{"kv_cache_config": {"max_tokens": 2048}}' "$result"

    echo ""
    echo "=== trtllm: no GPU override, but pass through existing JSON ==="
    result=$(build_trtllm_override_args_with_mem --merge-with-json '{"return_perf_metrics": true}')
    _assert "trtllm passthrough" '{"return_perf_metrics": true}' "$result"

242
    echo ""
243
244
245
    echo "=== missing engine ==="
    (build_gpu_mem_args 2>/dev/null)
    _assert "missing engine exits non-zero" "1" "$?"
246

247
248
249
250
251
    echo ""
    echo "=== trtllm rejected (use build_trtllm_override_args_with_mem) ==="
    (build_gpu_mem_args trtllm 2>/dev/null)
    _assert "trtllm rejected" "1" "$?"

252
253
254
255
256
257
258
    echo ""
    echo "=========================================="
    echo "Results: $pass passed, $fail failed"
    echo "=========================================="
    [[ "$fail" -eq 0 ]]
}

259
260
261
262
# Self-test: source this file then call _gpu_utils_self_test
if [[ "${BASH_SOURCE[0]}" == "$0" && "${1:-}" == "--self-test" ]]; then
    _gpu_utils_self_test
    exit $?
263
fi