#/bin/bash # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Parse command line arguments model="neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic" url="http://localhost:8000" mode="aggregated" artifacts_root_dir="artifacts_root" deployment_kind="dynamo" # Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are # selected for chat use case. Note that for other use cases, the results and # tuning would vary. isl=3000 osl=150 tp=0 dp=0 prefill_tp=0 prefill_dp=0 decode_tp=0 decode_dp=0 # The defaults can be overridden by command line arguments. while [[ $# -gt 0 ]]; do case $1 in --tensor-parallelism) tp="$2" shift 2 ;; --data-parallelism) dp="$2" shift 2 ;; --prefill-tensor-parallelism) prefill_tp="$2" shift 2 ;; --prefill-data-parallelism) prefill_dp="$2" shift 2 ;; --decode-tensor-parallelism) decode_tp="$2" shift 2 ;; --decode-data-parallelism) decode_dp="$2" shift 2 ;; --model) model="$2" shift 2 ;; --input-sequence-length) isl="$2" shift 2 ;; --output-sequence-length) osl="$2" shift 2 ;; --url) url="$2" shift 2 ;; --mode) mode="$2" shift 2 ;; --artifacts-root-dir) artifacts_root_dir="$2" shift 2 ;; --deployment-kind) deployment_kind="$2" shift 2 ;; *) echo "Unknown option: $1" exit 1 ;; esac done if [ "${mode}" == "aggregated" ]; then if [ "${tp}" == "0" ] && [ "${dp}" == "0" ]; then echo "--tensor-parallelism and --data-parallelism must be set for aggregated mode." exit 1 fi echo "Starting benchmark for the deployment service with the following configuration:" echo " - Tensor Parallelism: ${tp}" echo " - Data Parallelism: ${dp}" elif [ "${mode}" == "disaggregated" ]; then if [ "${prefill_tp}" == "0" ] && [ "${prefill_dp}" == "0" ] && [ "${decode_tp}" == "0" ] && [ "${decode_dp}" == "0" ]; then echo "--prefill-tensor-parallelism, --prefill-data-parallelism, --decode-tensor-parallelism and --decode-data-parallelism must be set for disaggregated mode." exit 1 fi echo "Starting benchmark for the deployment service with the following configuration:" echo " - Prefill Tensor Parallelism: ${prefill_tp}" echo " - Prefill Data Parallelism: ${prefill_dp}" echo " - Decode Tensor Parallelism: ${decode_tp}" echo " - Decode Data Parallelism: ${decode_dp}" else echo "Unknown mode: ${mode}. Only 'aggregated' and 'disaggregated' modes are supported." exit 1 fi echo "--------------------------------" echo "WARNING: This script does not validate tensor_parallelism=${tp} and data_parallelism=${dp} settings." echo " The user is responsible for ensuring these match the deployment configuration being benchmarked." echo " Incorrect settings may lead to misleading benchmark results." echo "--------------------------------" # Create artifacts root directory if it doesn't exist if [ ! -d "${artifacts_root_dir}" ]; then mkdir -p "${artifacts_root_dir}" fi # Find the next available artifacts directory index index=0 while [ -d "${artifacts_root_dir}/artifacts_${index}" ]; do index=$((index + 1)) done # Create the new artifacts directory artifact_dir="${artifacts_root_dir}/artifacts_${index}" mkdir -p "${artifact_dir}" # Print warning about existing artifacts directories if [ $index -gt 0 ]; then echo "--------------------------------" echo "WARNING: Found ${index} existing artifacts directories:" for ((i=0; i "${artifact_dir}/deployment_config.json" echo "Benchmarking Successful!!!"