#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Concurrency sweep: runs `genai-perf profile` once per concurrency level
# against the streaming chat-completions endpoint at http://localhost:8000,
# using synthetic prompts of a fixed input length and a fixed output length.
#
# Requires `genai-perf` on PATH. A failing run at one concurrency level does
# not stop the sweep; the remaining levels are still attempted.

model=neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic

# Input Sequence Length (isl) 3000 and Output Sequence Length (osl) 150 are
# selected for chat use case. Note that for other use cases, the results and
# tuning would vary.
isl=3000
osl=150

# Concurrency levels to test.
for concurrency in 1 2 4 8 16 32 64 128 256; do
  # Per level, the workload scales with the concurrency:
  #   - measured requests: 10x concurrency
  #   - warmup requests:    2x concurrency
  #   - dataset entries:   12x concurrency (the sum of the two counts above)
  # Token stddev is 0 and max_tokens == min_tokens == osl with ignore_eos set,
  # so every request asks for the same input and output length.
  # NOTE(review): the options after the bare `--` are presumably forwarded to
  # the underlying load-generation client -- confirm against genai-perf docs.
  genai-perf profile \
    --model "${model}" \
    --tokenizer "${model}" \
    --service-kind openai \
    --endpoint-type chat \
    --endpoint /v1/chat/completions \
    --streaming \
    --url http://localhost:8000 \
    --synthetic-input-tokens-mean "${isl}" \
    --synthetic-input-tokens-stddev 0 \
    --output-tokens-mean "${osl}" \
    --output-tokens-stddev 0 \
    --extra-inputs "max_tokens:${osl}" \
    --extra-inputs "min_tokens:${osl}" \
    --extra-inputs ignore_eos:true \
    --concurrency "${concurrency}" \
    --request-count "$((concurrency * 10))" \
    --warmup-request-count "$((concurrency * 2))" \
    --num-dataset-entries "$((concurrency * 12))" \
    --random-seed 100 \
    -- \
    -v \
    --max-threads 256 \
    -H 'Authorization: Bearer NOT USED' \
    -H 'Accept: text/event-stream'
done