"vllm/vscode:/vscode.git/clone" did not exist on "9532c49836ad9b5f2120ebba8caf0c56f998126f"
run.sh 8.19 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

17
18
19
# Strict mode: stop on a failing command (errexit), on use of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail
# Word-split unquoted expansions on newline and tab only, never on spaces.
IFS=$'\n\t'

20
21
22
# Absolute path of the directory containing this script; model recipe
# directories are looked up underneath it.
RECIPES_DIR="$(cd "$(dirname "$0")" && pwd)"

# Settings that may be pre-seeded from the environment.
: "${NAMESPACE:=dynamo}"
: "${GAIE:=false}"

# Values filled in by the command-line parser.
DEPLOY_TYPE=""
DEPLOYMENT=""
MODEL=""
FRAMEWORK=""
# Empty by default; --dry-run sets it to "echo" so that prefixed commands are
# printed instead of executed.
DRY_RUN=""

# Frameworks - following container/build.sh pattern
declare -A FRAMEWORKS=([VLLM]=1 [TRTLLM]=2 [SGLANG]=3)
DEFAULT_FRAMEWORK=VLLM

#######################################
# Print the command-line help text and terminate the script.
# Globals:   FRAMEWORKS, DEFAULT_FRAMEWORK, GAIE (read)
# Arguments: none
# Outputs:   help text on stdout
# Returns:   does not return; always exits with status 1
#######################################
usage() {
    # The script runs with IFS set to newline+tab, and "${!FRAMEWORKS[*]}"
    # joins the keys with the first IFS character. Switch to a space for the
    # duration of this function so the framework names stay on one help line.
    local IFS=' '
    local framework_names="${!FRAMEWORKS[*]}"

    echo "Usage: $0 [OPTIONS] --model <model> --framework <framework> --deployment <deployment-type>"
    echo ""
    echo "Required Options:"
    echo "  --model <model>       Model name (e.g., llama-3-70b)"
    echo "  --framework <fw>      Framework one of ${framework_names} (default: ${DEFAULT_FRAMEWORK})"
    echo "  --deployment <type>   Deployment type (e.g., agg, disagg etc, please refer to the README.md for available deployment types)"
    echo ""
    echo "Optional:"
    echo "  --namespace <ns>   Kubernetes namespace (default: dynamo)"
    echo "  --dry-run          Print commands without executing them"
    echo "  --gaie[=true|false] Enable GAIE integration subfolder (applies GAIE manifests skips benchmark) (default: ${GAIE})"
    echo "  -h, --help         Show this help message"
    echo ""
    echo "Environment Variables:"
    echo "  NAMESPACE             Kubernetes namespace (default: dynamo)"
    echo ""
    echo "Examples:"
    echo "  $0 --model llama-3-70b --framework vllm --deployment agg"
    echo "  $0 --model llama-3-70b --framework trtllm --deployment disagg-single-node"
    echo "  $0 --namespace my-ns --model llama-3-70b --framework vllm --deployment disagg-multi-node"
    exit 1
}

# Report that option $1 was supplied without a value, then show the help
# text. usage ends the script, so control does not come back to the caller.
missing_requirement() {
    local option_name=$1
    printf 'ERROR: %s requires an argument.\n' "$option_name"
    usage
}

# Write "<$1> <$2>" (joined by a single space) to stderr and terminate the
# script with exit status 1.
error() {
    local label=$1
    local detail=$2
    {
        printf '%s %s\n' "$label" "$detail"
    } >&2
    exit 1
}

# Parse command-line options. Every recognized option is consumed with
# `shift`; anything unrecognized terminates the script through error().
while [[ $# -gt 0 ]]; do
    case "$1" in
        --dry-run)
            # Commands later in the script are prefixed with $DRY_RUN, so
            # "echo" makes them print instead of run.
            DRY_RUN="echo"
            shift
            ;;
        --model|--framework|--deployment|--namespace)
            # Use "${2:-}" rather than "$2": with `set -u`, a bare "$2"
            # aborts with "unbound variable" when the option is the last word
            # on the command line, so the friendly message was never printed.
            if [[ -z "${2:-}" ]]; then
                missing_requirement "$1"
            fi
            case "$1" in
                --model)      MODEL=$2 ;;
                --framework)  FRAMEWORK=$2 ;;
                --deployment) DEPLOYMENT=$2 ;;
                --namespace)  NAMESPACE=$2 ;;
            esac
            shift 2
            ;;
        --gaie)
            GAIE=true
            shift
            ;;
        --gaie=*)
            # Accept true/false in any letter case; store the lower-case form.
            GAIE="${1#*=}"
            case "${GAIE,,}" in
                true|false) GAIE="${GAIE,,}" ;;
                *) echo "ERROR: --gaie must be true or false"; exit 1 ;;
            esac
            shift
            ;;
        -h|--help)
            usage
            ;;
        -*)
            error 'ERROR: Unknown option: ' "$1"
            ;;
        *)
            error "ERROR: Unknown argument: " "$1"
            ;;
    esac
done

# Fall back to the default framework when --framework was not given.
if [[ -z "$FRAMEWORK" ]]; then
    FRAMEWORK=$DEFAULT_FRAMEWORK
fi

# Normalize to upper case (the FRAMEWORKS keys are upper case) and reject any
# name that is not a key of FRAMEWORKS.
if [[ -n "$FRAMEWORK" ]]; then
    FRAMEWORK=${FRAMEWORK^^}
    # The ":-" default is required: with `set -u`, expanding a missing
    # associative-array key aborts with "unbound variable" before the
    # readable error below can be printed.
    if [[ -z "${FRAMEWORKS[$FRAMEWORK]:-}" ]]; then
        error 'ERROR: Unknown framework: ' "$FRAMEWORK"
    fi
fi

# Validate required arguments: report every missing one, then show the help
# text (which ends the script).
missing_required=false
if [[ -z "$MODEL" ]]; then
    echo "ERROR: --model argument is required"
    missing_required=true
fi
if [[ -z "$DEPLOYMENT" ]]; then
    echo "ERROR: --deployment argument is required"
    missing_required=true
fi
if [[ "$missing_required" == "true" ]]; then
    echo ""
    usage
fi

# Recipe layout: <recipes dir>/<model>/<framework, lower case>/<deployment type>/
MODEL_DIR="${RECIPES_DIR}/${MODEL}"
FRAMEWORK_DIR="${MODEL_DIR}/${FRAMEWORK,,}"
DEPLOY_PATH="${FRAMEWORK_DIR}/${DEPLOYMENT}"

# INTEGRATION is "gaie" when GAIE is true (any letter case), otherwise empty.
INTEGRATION=""
if [[ "${GAIE,,}" == "true" ]]; then
    INTEGRATION="gaie"
fi
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

# Print the names of the entries in directory $1, one per line and indented by
# two spaces, skipping every name that matches one of the glob patterns passed
# as the remaining arguments. Uses shell globbing instead of parsing `ls`
# output, so it needs no GNU-specific grep syntax and cannot trip `pipefail`
# when every entry is filtered out.
list_dir_entries() {
    local dir=$1
    shift
    local entry name pattern skip
    for entry in "$dir"/*; do
        # An unmatched glob stays literal; skip it (but keep dangling links).
        [[ -e "$entry" || -L "$entry" ]] || continue
        name=${entry##*/}
        skip=false
        for pattern in "$@"; do
            # Right-hand side is deliberately unquoted so it acts as a glob.
            if [[ "$name" == $pattern ]]; then
                skip=true
                break
            fi
        done
        if [[ "$skip" == "false" ]]; then
            printf '  %s\n' "$name"
        fi
    done
}

# Check if model directory exists
if [[ ! -d "$MODEL_DIR" ]]; then
    echo "Error: Model directory '$MODEL' does not exist in $RECIPES_DIR"
    echo "Available models:"
    list_dir_entries "$RECIPES_DIR" '*.sh' '*.md' '*model-cache'
    exit 1
fi

# Check if framework directory exists
if [[ ! -d "$FRAMEWORK_DIR" ]]; then
    echo "Error: Framework directory '${FRAMEWORK,,}' does not exist in $MODEL_DIR"
    echo "Available frameworks for $MODEL:"
    list_dir_entries "$MODEL_DIR" '*.sh' '*.md'
    exit 1
fi

# Check if deployment directory exists
if [[ ! -d "$DEPLOY_PATH" ]]; then
    echo "Error: Deployment type '$DEPLOYMENT' does not exist in $FRAMEWORK_DIR"
    echo "Available deployment types for $MODEL/${FRAMEWORK,,}:"
    list_dir_entries "$FRAMEWORK_DIR" '*.sh' '*.md'
    exit 1
fi

# Manifests expected inside the selected deployment directory.
DEPLOY_FILE="${DEPLOY_PATH}/deploy.yaml"
PERF_FILE="${DEPLOY_PATH}/perf.yaml"

# deploy.yaml is mandatory; stop here when it is missing.
[[ -f "$DEPLOY_FILE" ]] || {
    echo "Error: Deployment file '$DEPLOY_FILE' not found"
    exit 1
}

197
198
199
200
201
202
203
# perf.yaml is optional: record whether it exists so the benchmark step can be
# skipped later when it does not.
if [[ -f "$PERF_FILE" ]]; then
    PERF_AVAILABLE=true
    echo "Performance benchmark file found: $PERF_FILE"
else
    PERF_AVAILABLE=false
    echo "Performance benchmark file not found: $PERF_FILE (skipping benchmarks)"
fi

# Print a summary banner of the resolved settings (one argument per line).
printf '%s\n' \
    "======================================" \
    "Dynamo Recipe Deployment" \
    "======================================" \
    "Model: $MODEL" \
    "Framework: ${FRAMEWORK,,}" \
    "Deployment Type: $DEPLOYMENT" \
    "Namespace: $NAMESPACE" \
    "GAIE integration: $GAIE" \
    "======================================"

# Handle model downloading: create the model-cache PVC and start the download
# job from the manifests in <model dir>/model-cache.
MODEL_CACHE_DIR="$MODEL_DIR/model-cache"
echo "Creating PVC for model cache and downloading model..."
# $DRY_RUN is intentionally unquoted: it is either empty (the command runs) or
# "echo" (the command is printed). All other expansions are quoted so paths
# and names are never glob-expanded or split.
$DRY_RUN kubectl apply -n "$NAMESPACE" -f "$MODEL_CACHE_DIR/model-cache.yaml"
$DRY_RUN kubectl apply -n "$NAMESPACE" -f "$MODEL_CACHE_DIR/model-download.yaml"

# Wait for the model download to complete.
# The job name is the second field of the first line containing "name:" in
# the download manifest. A single awk call replaces grep | head | awk, which
# under `pipefail` ended the script without any message when nothing matched.
MODEL_DOWNLOAD_JOB_NAME=$(awk '/name:/ { print $2; exit }' "$MODEL_CACHE_DIR/model-download.yaml")
if [[ -z "$MODEL_DOWNLOAD_JOB_NAME" ]]; then
    echo "Error: could not determine the job name from '$MODEL_CACHE_DIR/model-download.yaml'" >&2
    exit 1
fi
echo "Waiting for job '$MODEL_DOWNLOAD_JOB_NAME' to complete..."
$DRY_RUN kubectl wait --for=condition=Complete "job/$MODEL_DOWNLOAD_JOB_NAME" -n "$NAMESPACE" --timeout=6000s
227
228

# Deploy the specified configuration.
echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOYMENT configuration..."
# $DRY_RUN is intentionally unquoted (empty, or "echo" for --dry-run); the
# namespace and manifest path are quoted so they are passed through verbatim.
$DRY_RUN kubectl apply -n "$NAMESPACE" -f "$DEPLOY_FILE"

232
233
234
235
236
237
238
239
240
if [[ "$INTEGRATION" == "gaie" ]]; then
    # Run the GAIE checks, then apply the GAIE manifests stored under the
    # selected deployment directory. Both commands are prefixed with $DRY_RUN
    # (intentionally unquoted: empty, or "echo") so that --dry-run prints them
    # instead of executing them, like every other command in this script.
    SCRIPT_DIR="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    $DRY_RUN "${SCRIPT_DIR}/gaie_checks.sh"
    $DRY_RUN kubectl apply -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE"
    # For now do not run the benchmark
    exit 0
fi

241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# Launch the benchmark job (if available), wait for it and print its logs.
if [[ "$PERF_AVAILABLE" == "true" ]]; then
    echo "Launching benchmark job..."
    # $DRY_RUN is intentionally unquoted (empty, or "echo" for --dry-run);
    # every other expansion is quoted.
    $DRY_RUN kubectl apply -n "$NAMESPACE" -f "$PERF_FILE"

    # The job name is the second field of the first line containing "name:"
    # in the perf manifest. One awk call replaces grep | head | awk, which
    # under `pipefail` ended the script silently when nothing matched.
    JOB_NAME=$(awk '/name:/ { print $2; exit }' "$PERF_FILE")
    if [[ -z "$JOB_NAME" ]]; then
        echo "Error: could not determine the job name from '$PERF_FILE'" >&2
        exit 1
    fi
    echo "Waiting for job '$JOB_NAME' to complete..."
    $DRY_RUN kubectl wait --for=condition=Complete "job/$JOB_NAME" -n "$NAMESPACE" --timeout=6000s

    # Print logs from the benchmark job
    echo "======================================"
    echo "Benchmark completed. Logs:"
    echo "======================================"
    $DRY_RUN kubectl logs "job/$JOB_NAME" -n "$NAMESPACE"
else
    echo "======================================"
    echo "Deployment completed successfully!"
    echo "No performance benchmark available for this configuration."
    echo "======================================"
fi