Unverified Commit 8585c300 authored by richardhuo-nv's avatar richardhuo-nv Committed by GitHub
Browse files

docs: add concurrency choice to the perf.sh (#1497)

parent 73e0f8ca
......@@ -14,13 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Parse command line arguments
# Default Values
model="neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic"
url="http://localhost:8000"
mode="aggregated"
artifacts_root_dir="artifacts_root"
deployment_kind="dynamo"
concurrency_list="1,2,4,8,16,32,64,128,256"
# Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are
# selected for chat use case. Note that for other use cases, the results and
......@@ -35,42 +35,66 @@ prefill_dp=0
decode_tp=0
decode_dp=0
# Print the usage summary for this script and terminate.
# Each "(default: ...)" field shows whatever value the corresponding variable
# holds at the moment this function is called.
# Globals (read): tp, dp, prefill_tp, prefill_dp, decode_tp, decode_dp, model,
#                 isl, osl, url, concurrency_list, mode, artifacts_root_dir,
#                 deployment_kind
# Outputs: usage text on stdout
# Exits:   always exits with status 0; control never returns to the caller
print_help() {
  # A single printf emits one line per argument; the empty arguments produce
  # the blank separator lines.
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    " --tensor-parallelism, --tp <int> Tensor parallelism (default: $tp)" \
    " --data-parallelism, --dp <int> Data parallelism (default: $dp)" \
    " --prefill-tensor-parallelism, --prefill-tp <int> Prefill tensor parallelism (default: $prefill_tp)" \
    " --prefill-data-parallelism, --prefill-dp <int> Prefill data parallelism (default: $prefill_dp)" \
    " --decode-tensor-parallelism, --decode-tp <int> Decode tensor parallelism (default: $decode_tp)" \
    " --decode-data-parallelism, --decode-dp <int> Decode data parallelism (default: $decode_dp)" \
    " --model <model_id> Hugging Face model ID to benchmark (default: $model)" \
    " --input-sequence-length, --isl <int> Input sequence length (default: $isl)" \
    " --output-sequence-length, --osl <int> Output sequence length (default: $osl)" \
    " --url <http://host:port> Target URL for inference requests (default: $url)" \
    " --concurrency <list> Comma-separated concurrency levels (default: $concurrency_list)" \
    " --mode <aggregated|disaggregated> Serving mode (default: $mode)" \
    " --artifacts-root-dir <path> Root directory to store benchmark results (default: $artifacts_root_dir)" \
    " --deployment-kind <type> Deployment tag used for pareto chart labels (default: $deployment_kind)" \
    " --help Show this help message and exit" \
    ""
  exit 0
}
# Parse command line arguments
# The defaults can be overridden by command line arguments.
while [[ $# -gt 0 ]]; do
case $1 in
--tensor-parallelism)
--tensor-parallelism|--tp)
tp="$2"
shift 2
;;
--data-parallelism)
--data-parallelism|--dp)
dp="$2"
shift 2
;;
--prefill-tensor-parallelism)
--prefill-tensor-parallelism|--prefill-tp)
prefill_tp="$2"
shift 2
;;
--prefill-data-parallelism)
--prefill-data-parallelism|--prefill-dp)
prefill_dp="$2"
shift 2
;;
--decode-tensor-parallelism)
--decode-tensor-parallelism|--decode-tp)
decode_tp="$2"
shift 2
;;
--decode-data-parallelism)
--decode-data-parallelism|--decode-dp)
decode_dp="$2"
shift 2
;;
--model)
--model)
model="$2"
shift 2
;;
--input-sequence-length)
--input-sequence-length|--isl)
isl="$2"
shift 2
;;
--output-sequence-length)
--output-sequence-length|--osl)
osl="$2"
shift 2
;;
......@@ -78,6 +102,10 @@ while [[ $# -gt 0 ]]; do
url="$2"
shift 2
;;
--concurrency)
concurrency_list="$2"
shift 2
;;
--mode)
mode="$2"
shift 2
......@@ -90,6 +118,9 @@ while [[ $# -gt 0 ]]; do
deployment_kind="$2"
shift 2
;;
--help)
print_help
;;
*)
echo "Unknown option: $1"
exit 1
......@@ -97,6 +128,20 @@ while [[ $# -gt 0 ]]; do
esac
done
# Function to validate if concurrency values are positive integers
# Validate the requested concurrency levels.
# Checks that the global array `concurrency_array` holds at least one entry
# and that every entry is a positive integer written in decimal digits.
# Globals:  concurrency_array (read)
# Outputs:  an error message on stderr when validation fails
# Exits:    terminates the script with status 1 on an empty list or on the
#           first invalid entry; otherwise returns 0
validate_concurrency() {
  # Keep the loop variable out of the caller's (global) scope.
  local val

  # With no entries the loop below would check nothing and report success.
  if [[ ${#concurrency_array[@]} -eq 0 ]]; then
    echo "Error: No concurrency values provided. Must be a comma-separated list of positive integers." >&2
    exit 1
  fi

  for val in "${concurrency_array[@]}"; do
    # Digits only, with at least one non-zero digit. The check is purely
    # textual, so it cannot trip over integer overflow or octal parsing of
    # leading zeros the way an arithmetic comparison could.
    if ! [[ "$val" =~ ^0*[1-9][0-9]*$ ]]; then
      echo "Error: Invalid concurrency value '$val'. Must be a positive integer." >&2
      exit 1
    fi
  done
}
IFS=',' read -r -a concurrency_array <<< "$concurrency_list"
# Validate concurrency values
validate_concurrency
if [ "${mode}" == "aggregated" ]; then
if [ "${tp}" == "0" ] && [ "${dp}" == "0" ]; then
echo "--tensor-parallelism and --data-parallelism must be set for aggregated mode."
......@@ -157,8 +202,15 @@ if [ $index -gt 0 ]; then
echo "--------------------------------"
fi
echo "Running genai-perf with:"
echo "Model: $model"
echo "ISL: $isl"
echo "OSL: $osl"
echo "Concurrency levels: ${concurrency_array[@]}"
# Concurrency levels to test
for concurrency in 1 2 4 8 16 32 64 128 256; do
for concurrency in "${concurrency_array[@]}"; do
echo "Run concurrency: $concurrency"
# NOTE: For Dynamo HTTP OpenAI frontend, use `nvext` for fields like
# `ignore_eos` since they are not in the official OpenAI spec.
......@@ -185,7 +237,7 @@ for concurrency in 1 2 4 8 16 32 64 128 256; do
--artifact-dir ${artifact_dir} \
-- \
-v \
--max-threads 256 \
--max-threads ${concurrency} \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'
......
......@@ -242,6 +242,23 @@ Single-Node
bash -x /workspace/benchmarks/llm/perf.sh --mode aggregated --deployment-kind vllm_serve --tensor-parallelism 8 --data-parallelism 2
```
You can also run the benchmarking script with an explicit model, input sequence length, output sequence length, and set of concurrency levels to benchmark:
```bash
bash -x /workspace/benchmarks/llm/perf.sh \
--mode aggregated \
--deployment-kind vllm_serve \
--tensor-parallelism 1 \
--data-parallelism 1 \
--model neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic \
--input-sequence-length 3000 \
--output-sequence-length 150 \
--url http://localhost:8000 \
--concurrency 1,2,4,8,16,32,64,128,256
# The `--concurrency` option accepts either a single value (e.g., 64) or a comma-separated list (e.g., 1,2,4,8) to specify multiple concurrency levels for benchmarking.
```
> [!Important]
> We should be careful when specifying these options for the `perf.sh` script. They should closely reflect the deployment config that is being benchmarked. See `perf.sh --help` to learn more about these options. In the above command, we described that our deployment is using aggregated serving in `vllm serve`. We have also accurately described that we have 2 workers with TP=4 (or TP=8 for two nodes).
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment