#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

#######################################
# Poll a server's /health endpoint until the health checker reports the
# model as ready, or until a timeout elapses.
#
# Each iteration fetches http://<host>:<port>/health with curl and feeds the
# response on stdin to `python3 /scripts/check_server_health.py`, passing the
# expected prefill and decode counts as arguments. The model is considered
# ready when the checker's output contains the text "Model is ready.".
#
# Arguments:
#   $1 - model host
#   $2 - model port
#   $3 - expected number of prefills            (default: 1)
#   $4 - expected number of decodes             (default: 1)
#   $5 - seconds to sleep between polls         (default: 1)
#   $6 - overall timeout in seconds             (default: 600)
#   $7 - seconds between progress reports       (default: 60)
# Outputs:
#   Progress and the checker's output on stdout.
# Returns:
#   0 once the model is ready.
#   On timeout the function calls `exit 2`, terminating the calling shell
#   (not just the function); run it in a subshell to contain that.
#######################################
wait_for_model() {
    local model_host=$1
    local model_port=$2
    local n_prefill=${3:-1}
    local n_decode=${4:-1}
    local poll=${5:-1}
    local timeout=${6:-600}
    local report_every=${7:-60}

    local health_addr="http://${model_host}:${model_port}/health"
    echo "Polling ${health_addr} every ${poll} seconds to check whether ${n_prefill} prefills and ${n_decode} decodes are alive"

    # Declared separately from assignment so `local` does not mask exit codes,
    # and so none of the loop state leaks into the caller's namespace.
    local start_ts report_ts time_now curl_result check_result
    start_ts=$(date +%s)
    report_ts=$start_ts

    while :; do
        # Curl timeout - the primary use case is launching this on the first
        # node (localhost), so no explicit curl timeout is set.
        # A failing curl (e.g. server not listening yet) is expected while
        # polling; `|| true` keeps a caller's `set -e` from aborting the loop.
        curl_result=$(curl "${health_addr}" 2>/dev/null) || true

        # Python path - `check_server_health.py` is referenced by absolute
        # path rather than through any packaging.
        # A non-zero checker status is treated as "not ready yet".
        check_result=$(python3 /scripts/check_server_health.py "$n_prefill" "$n_decode" <<< "$curl_result") || true

        if [[ $check_result == *"Model is ready."* ]]; then
            # Quoted so the checker's output is printed verbatim (no word
            # splitting or glob expansion).
            printf '%s\n' "$check_result"
            return 0
        fi

        time_now=$(date +%s)
        if (( time_now - start_ts >= timeout )); then
            echo "Model did not get healthy in ${timeout} seconds"
            exit 2
        fi

        # Periodically surface the latest checker output so long waits are
        # not silent.
        if (( time_now - report_ts >= report_every )); then
            printf '%s\n' "$check_result"
            report_ts=$time_now
        fi

        sleep "$poll"
    done
}

#######################################
# Warm up a served model by running `python3 -m sglang.bench_serving`
# against it at a series of increasing concurrency levels
# (1, 2, 4, ..., 128). Each run sends 5 * concurrency random prompts with
# 1024 input and 1024 output tokens at an unbounded ("inf") request rate.
#
# Arguments (all optional; defaults match the previously hard-coded values):
#   $1 - service host                         (default: localhost)
#   $2 - service port                         (default: 8000)
#   $3 - served model name, passed as --model (default: deepseek-ai/DeepSeek-R1)
#   $4 - model path, passed as --tokenizer    (default: deepseek-ai/DeepSeek-R1-0528)
#   $5 - config; accepted for interface compatibility but not used here
# Outputs:
#   One "Running with concurrency: ..." line per run, plus whatever the
#   benchmark tool prints.
# Returns:
#   The exit status of the last benchmark run; a failing run does not stop
#   the remaining runs.
#######################################
warmup_model() {
    local service_host=${1:-localhost}
    local service_port=${2:-8000}
    local served_model_name=${3:-deepseek-ai/DeepSeek-R1}
    local model_path=${4:-deepseek-ai/DeepSeek-R1-0528}

    local chosen_isl=1024
    local chosen_osl=1024
    local chosen_req_rate="inf"
    local -a chosen_concurrencies=(1 2 4 8 16 32 64 128)

    local concurrency num_prompts
    local -a command

    for concurrency in "${chosen_concurrencies[@]}"; do
        num_prompts=$((concurrency * 5))

        # Built as an array so every argument reaches the tool as exactly one
        # word, regardless of spaces or glob characters in the values.
        command=(
            python3 -m sglang.bench_serving
            --base-url "http://${service_host}:${service_port}"
            --model "${served_model_name}" --tokenizer "${model_path}"
            --backend sglang-oai
            --dataset-name random --random-input "${chosen_isl}" --random-output "${chosen_osl}"
            --random-range-ratio 1
            --num-prompts "${num_prompts}" --request-rate "${chosen_req_rate}" --max-concurrency "${concurrency}"
        )

        echo "Running with concurrency: ${concurrency}, num_prompts: ${num_prompts}"
        "${command[@]}"
    done
}