Unverified Commit 8585c300 authored by richardhuo-nv's avatar richardhuo-nv Committed by GitHub
Browse files

docs: add concurrency choice to the perf.sh (#1497)

parent 73e0f8ca
...@@ -14,13 +14,13 @@ ...@@ -14,13 +14,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Default Values
# Parse command line arguments
model="neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic" model="neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic"
url="http://localhost:8000" url="http://localhost:8000"
mode="aggregated" mode="aggregated"
artifacts_root_dir="artifacts_root" artifacts_root_dir="artifacts_root"
deployment_kind="dynamo" deployment_kind="dynamo"
concurrency_list="1,2,4,8,16,32,64,128,256"
# Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are # Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are
# selected for chat use case. Note that for other use cases, the results and # selected for chat use case. Note that for other use cases, the results and
...@@ -35,30 +35,54 @@ prefill_dp=0 ...@@ -35,30 +35,54 @@ prefill_dp=0
decode_tp=0 decode_tp=0
decode_dp=0 decode_dp=0
# Print the usage summary to stdout and terminate the script with status 0.
# Each option is listed with the current value of its backing variable,
# labelled as the default.
# Globals (read): tp, dp, prefill_tp, prefill_dp, decode_tp, decode_dp, model,
#                 isl, osl, url, concurrency_list, mode, artifacts_root_dir,
#                 deployment_kind
print_help() {
  # One printf call emits every line; empty arguments produce the blank lines.
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    " --tensor-parallelism, --tp <int> Tensor parallelism (default: $tp)" \
    " --data-parallelism, --dp <int> Data parallelism (default: $dp)" \
    " --prefill-tensor-parallelism, --prefill-tp <int> Prefill tensor parallelism (default: $prefill_tp)" \
    " --prefill-data-parallelism, --prefill-dp <int> Prefill data parallelism (default: $prefill_dp)" \
    " --decode-tensor-parallelism, --decode-tp <int> Decode tensor parallelism (default: $decode_tp)" \
    " --decode-data-parallelism, --decode-dp <int> Decode data parallelism (default: $decode_dp)" \
    " --model <model_id> Hugging Face model ID to benchmark (default: $model)" \
    " --input-sequence-length, --isl <int> Input sequence length (default: $isl)" \
    " --output-sequence-length, --osl <int> Output sequence length (default: $osl)" \
    " --url <http://host:port> Target URL for inference requests (default: $url)" \
    " --concurrency <list> Comma-separated concurrency levels (default: $concurrency_list)" \
    " --mode <aggregated|disaggregated> Serving mode (default: $mode)" \
    " --artifacts-root-dir <path> Root directory to store benchmark results (default: $artifacts_root_dir)" \
    " --deployment-kind <type> Deployment tag used for pareto chart labels (default: $deployment_kind)" \
    " --help Show this help message and exit" \
    ""
  exit 0
}
# Parse command line arguments
# The defaults can be overridden by command line arguments. # The defaults can be overridden by command line arguments.
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
--tensor-parallelism) --tensor-parallelism|--tp)
tp="$2" tp="$2"
shift 2 shift 2
;; ;;
--data-parallelism) --data-parallelism|--dp)
dp="$2" dp="$2"
shift 2 shift 2
;; ;;
--prefill-tensor-parallelism) --prefill-tensor-parallelism|--prefill-tp)
prefill_tp="$2" prefill_tp="$2"
shift 2 shift 2
;; ;;
--prefill-data-parallelism) --prefill-data-parallelism|--prefill-dp)
prefill_dp="$2" prefill_dp="$2"
shift 2 shift 2
;; ;;
--decode-tensor-parallelism) --decode-tensor-parallelism|--decode-tp)
decode_tp="$2" decode_tp="$2"
shift 2 shift 2
;; ;;
--decode-data-parallelism) --decode-data-parallelism|--decode-dp)
decode_dp="$2" decode_dp="$2"
shift 2 shift 2
;; ;;
...@@ -66,11 +90,11 @@ while [[ $# -gt 0 ]]; do ...@@ -66,11 +90,11 @@ while [[ $# -gt 0 ]]; do
model="$2" model="$2"
shift 2 shift 2
;; ;;
--input-sequence-length) --input-sequence-length|--isl)
isl="$2" isl="$2"
shift 2 shift 2
;; ;;
--output-sequence-length) --output-sequence-length|--osl)
osl="$2" osl="$2"
shift 2 shift 2
;; ;;
...@@ -78,6 +102,10 @@ while [[ $# -gt 0 ]]; do ...@@ -78,6 +102,10 @@ while [[ $# -gt 0 ]]; do
url="$2" url="$2"
shift 2 shift 2
;; ;;
--concurrency)
concurrency_list="$2"
shift 2
;;
--mode) --mode)
mode="$2" mode="$2"
shift 2 shift 2
...@@ -90,6 +118,9 @@ while [[ $# -gt 0 ]]; do ...@@ -90,6 +118,9 @@ while [[ $# -gt 0 ]]; do
deployment_kind="$2" deployment_kind="$2"
shift 2 shift 2
;; ;;
--help)
print_help
;;
*) *)
echo "Unknown option: $1" echo "Unknown option: $1"
exit 1 exit 1
...@@ -97,6 +128,20 @@ while [[ $# -gt 0 ]]; do ...@@ -97,6 +128,20 @@ while [[ $# -gt 0 ]]; do
esac esac
done done
# Validate that every requested concurrency level is a positive integer.
# Globals:  concurrency_array (read)
# Outputs:  an error message on stderr describing the first problem found
# Exits:    status 1 if the list is empty or any entry is not a positive
#           integer; returns normally otherwise
validate_concurrency() {
  local val

  # An empty list would otherwise pass validation and make the benchmark
  # loop over concurrency_array run zero times without any diagnostic.
  if [[ ${#concurrency_array[@]} -eq 0 ]]; then
    echo "Error: No concurrency values provided. Must be a positive integer." >&2
    exit 1
  fi

  for val in "${concurrency_array[@]}"; do
    # Digits only, with at least one non-zero digit: rejects empty strings,
    # signs, non-numeric text and zero, without relying on `[ -le ]`, which
    # emits its own error for values too large for shell integer comparison.
    if ! [[ "$val" =~ ^0*[1-9][0-9]*$ ]]; then
      echo "Error: Invalid concurrency value '$val'. Must be a positive integer." >&2
      exit 1
    fi
  done
}
# Split the comma-separated --concurrency value into concurrency_array.
# The IFS=',' assignment prefixes the read command only, so it does not
# change IFS for the rest of the script.
IFS=',' read -r -a concurrency_array <<< "$concurrency_list"
# Validate concurrency values (exits with status 1 on the first invalid entry)
validate_concurrency
if [ "${mode}" == "aggregated" ]; then if [ "${mode}" == "aggregated" ]; then
if [ "${tp}" == "0" ] && [ "${dp}" == "0" ]; then if [ "${tp}" == "0" ] && [ "${dp}" == "0" ]; then
echo "--tensor-parallelism and --data-parallelism must be set for aggregated mode." echo "--tensor-parallelism and --data-parallelism must be set for aggregated mode."
...@@ -157,8 +202,15 @@ if [ $index -gt 0 ]; then ...@@ -157,8 +202,15 @@ if [ $index -gt 0 ]; then
echo "--------------------------------" echo "--------------------------------"
fi fi
# Summarize the benchmark parameters before the runs start.
echo "Running genai-perf with:"
echo "Model: $model"
echo "ISL: $isl"
echo "OSL: $osl"
# Use [*] rather than [@] when interpolating an array into a single string
# (ShellCheck SC2145). Elements are joined with the first character of IFS,
# which is a space as long as IFS is left at its default — no global IFS
# change is visible in this script.
echo "Concurrency levels: ${concurrency_array[*]}"
# Concurrency levels to test # Concurrency levels to test
for concurrency in 1 2 4 8 16 32 64 128 256; do for concurrency in "${concurrency_array[@]}"; do
echo "Run concurrency: $concurrency"
# NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like # NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
# `ignore_eos` since they are not in the official OpenAI spec. # `ignore_eos` since they are not in the official OpenAI spec.
...@@ -185,7 +237,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do ...@@ -185,7 +237,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
--artifact-dir ${artifact_dir} \ --artifact-dir ${artifact_dir} \
-- \ -- \
-v \ -v \
--max-threads 256 \ --max-threads ${concurrency} \
-H 'Authorization: Bearer NOT USED' \ -H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream' -H 'Accept: text/event-stream'
......
...@@ -242,6 +242,23 @@ Single-Node ...@@ -242,6 +242,23 @@ Single-Node
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2 bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2
``` ```
You can also run the benchmarking script with an explicit model, input sequence length, output sequence length, and set of concurrency levels:
```bash
bash -x /workspace/benchmarks/llm/perf.sh \
--mode aggregated \
--deployment-kind vllm_serve \
--tensor-parallelism 1 \
--data-parallelism 1 \
--model neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
--input-sequence-length 3000 \
--output-sequence-length 150 \
--url http://localhost:8000 \
--concurrency 1,2,4,8,16,32,64,128,256
# The `--concurrency` option accepts either a single value (e.g., 64) or a comma-separated list (e.g., 1,2,4,8) to specify multiple concurrency levels for benchmarking.
```
> [!Important] > [!Important]
> We should be careful in specifying these options in the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using aggregated serving in `vllm serve`. We have also accurately described that we have 2 workers with TP=4 (or TP=8 for two nodes). > We should be careful in specifying these options in the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using aggregated serving in `vllm serve`. We have also accurately described that we have 2 workers with TP=4 (or TP=8 for two nodes).
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment