more models for vLLM Benchmark Suite (#35086)

Signed-off-by: louie-tsai <louie.tsai@intel.com>

more models for vLLM Benchmark Suite (#35086)
Signed-off-by: louie-tsai <louie.tsai@intel.com>
17852aa5 · Louie Tsai · GitHub · 8647c6cf · 17852aa5 · 17852aa5
Unverified Commit 17852aa5 authored Mar 11, 2026 by Louie Tsai Committed by GitHub Mar 12, 2026
8 changed files
--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
@@ -12,6 +12,13 @@ DRY_RUN="${DRY_RUN:-0}"
 MODEL_FILTER="${MODEL_FILTER:-}"
 DTYPE_FILTER="${DTYPE_FILTER:-}"
+# Adaptive search controls
+ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}"
+SLA_TTFT_MS="${SLA_TTFT_MS:-3000}"
+SLA_TPOT_MS="${SLA_TPOT_MS:-100}"
+ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}"
+ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}"
 check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
@@ -183,6 +190,304 @@ upload_to_buildkite() {
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
 }
+# -------------------------------
+# Adaptive concurrency helpers
+# -------------------------------
+result_json_path_for_serving() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency=$3
+  echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json"
+}
+extract_metric_ms() {
+  local metric_name=$1
+  local json_file=$2
+  [[ -f "$json_file" ]] || return 0
+  if [[ "$metric_name" == "ttft" ]]; then
+    jq -r '
+      [
+        .ttft_ms.p99?,
+        .metrics.ttft_ms.p99?,
+        .ttft.p99?,
+        .metrics.ttft.p99?,
+        .p99_ttft_ms?,
+        .ttft_ms.mean?,
+        .metrics.ttft_ms.mean?,
+        .ttft.mean?,
+        .metrics.ttft.mean?,
+        .mean_ttft_ms?
+      ] | map(select(. != null)) | .[0] // empty
+    ' "$json_file"
+  else
+    jq -r '
+      [
+        .tpot_ms.p99?,
+        .metrics.tpot_ms.p99?,
+        .tpot.p99?,
+        .metrics.tpot.p99?,
+        .p99_tpot_ms?,
+        .itl_ms.p99?,
+        .metrics.itl_ms.p99?,
+        .inter_token_latency_ms.p99?,
+        .tpot_ms.mean?,
+        .metrics.tpot_ms.mean?,
+        .tpot.mean?,
+        .metrics.tpot.mean?,
+        .itl_ms.mean?,
+        .metrics.itl_ms.mean?,
+        .mean_tpot_ms?,
+        .mean_itl_ms?
+      ] | map(select(. != null)) | .[0] // empty
+    ' "$json_file"
+  fi
+}
+evaluate_sla_from_json() {
+  local json_file=$1
+  local ttft
+  local tpot
+  local pass
+  [[ -f "$json_file" ]] || return 2
+  ttft=$(extract_metric_ms ttft "$json_file")
+  tpot=$(extract_metric_ms tpot "$json_file")
+  [[ -n "$ttft" && -n "$tpot" ]] || return 2
+  pass=$(jq -n \
+    --argjson ttft "$ttft" \
+    --argjson tpot "$tpot" \
+    --argjson sla_ttft "$SLA_TTFT_MS" \
+    --argjson sla_tpot "$SLA_TPOT_MS" \
+    '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)')
+  [[ "$pass" == "true" ]]
+}
+write_adaptive_summary_json() {
+  local summary_file=$1
+  local test_name=$2
+  local qps=$3
+  local static_last_pass=$4
+  local static_first_fail=$5
+  local final_last_pass=$6
+  local final_first_fail=$7
+  jq -n \
+    --arg test_name "$test_name" \
+    --arg qps "$qps" \
+    --argjson sla_ttft "$SLA_TTFT_MS" \
+    --argjson sla_tpot "$SLA_TPOT_MS" \
+    --arg static_last_pass "${static_last_pass:-}" \
+    --arg static_first_fail "${static_first_fail:-}" \
+    --arg final_last_pass "${final_last_pass:-}" \
+    --arg final_first_fail "${final_first_fail:-}" \
+    '{
+      test_name: $test_name,
+      qps: $qps,
+      sla_ttft_ms: $sla_ttft,
+      sla_tpot_ms: $sla_tpot,
+      static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end),
+      static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end),
+      final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end),
+      final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | tonumber) end)
+    }' > "$summary_file"
+}
+run_single_serving_probe() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency=$3
+  local tp=$4
+  local compilation_config_mode=$5
+  local optimization_level=$6
+  local client_args_effective=$7
+  local client_remote_args=$8
+  local server_command=$9
+  local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
+  local result_json
+  local num_prompts_arg=""
+  local client_command
+  result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency")
+  if [[ -f "$result_json" ]]; then
+    evaluate_sla_from_json "$result_json"
+    return $?
+  fi
+  if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+    num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+    if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+    if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+    num_prompts_arg="--num-prompts $num_prompts"
+  fi
+  client_command="vllm bench serve \
+    --save-result \
+    --result-dir $RESULTS_FOLDER \
+    --result-filename ${new_test_name}.json \
+    --request-rate $qps \
+    --max-concurrency $max_concurrency \
+    $num_prompts_arg \
+    --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \
+    $client_args_effective $client_remote_args "
+  echo "Adaptive probe: $client_command"
+  if [[ "${DRY_RUN:-0}" != "1" ]]; then
+    bash -c "$client_command"
+  fi
+  jq_output=$(jq -n \
+    --arg server "$server_command" \
+    --arg client "$client_command" \
+    --arg gpu "$gpu_type" \
+    '{
+      server_command: $server,
+      client_command: $client,
+      gpu_type: $gpu,
+      adaptive_search: true
+    }')
+  echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+  evaluate_sla_from_json "$result_json"
+}
+adaptive_refine_from_static_results() {
+  local test_name=$1
+  local qps=$2
+  local max_concurrency_list_raw=$3
+  local tp=$4
+  local compilation_config_mode=$5
+  local optimization_level=$6
+  local client_args_effective=$7
+  local client_remote_args=$8
+  local server_command=$9
+  local sorted_points
+  local point
+  local rc
+  local static_last_pass=""
+  local static_first_fail=""
+  local largest_static=""
+  local step_hint=1
+  local previous_point=""
+  local low
+  local high
+  local mid
+  local probes=0
+  local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json"
+  [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0
+  [[ "${DRY_RUN:-0}" != "1" ]] || return 0
+  sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq)
+  [[ -n "$sorted_points" ]] || return 0
+  while read -r point; do
+    [[ -z "$point" ]] && continue
+    largest_static="$point"
+    evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")"
+    rc=$?
+    if (( rc == 0 )); then
+      static_last_pass="$point"
+    elif (( rc == 1 )); then
+      if [[ -n "$static_last_pass" ]]; then
+        static_first_fail="$point"
+        break
+      fi
+    fi
+    if [[ -n "$previous_point" ]]; then
+      step_hint=$(( point - previous_point ))
+      if (( step_hint < 1 )); then step_hint=1; fi
+    fi
+    previous_point="$point"
+  done <<< "$sorted_points"
+  if [[ -z "$static_last_pass" ]]; then
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail"
+    return 0
+  fi
+  if [[ -n "$static_first_fail" ]]; then
+    low=$static_last_pass
+    high=$static_first_fail
+    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+      mid=$(( (low + high) / 2 ))
+      probes=$(( probes + 1 ))
+      run_single_serving_probe \
+        "$test_name" "$qps" "$mid" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
+      rc=$?
+      if (( rc == 0 )); then
+        low=$mid
+      elif (( rc == 1 )); then
+        high=$mid
+      else
+        break
+      fi
+    done
+    write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high"
+    return 0
+  fi
+  low=$largest_static
+  high=""
+  while (( probes < ADAPTIVE_MAX_PROBES )); do
+    point=$(( low + step_hint ))
+    if (( point > ADAPTIVE_MAX_CONCURRENCY )); then
+      point=$ADAPTIVE_MAX_CONCURRENCY
+    fi
+    (( point > low )) || break
+    probes=$(( probes + 1 ))
+    run_single_serving_probe \
+      "$test_name" "$qps" "$point" "$tp" \
+      "$compilation_config_mode" "$optimization_level" \
+      "$client_args_effective" "$client_remote_args" "$server_command"
+    rc=$?
+    if (( rc == 0 )); then
+      low=$point
+      (( point == ADAPTIVE_MAX_CONCURRENCY )) && break
+      step_hint=$(( step_hint * 2 ))
+      if (( step_hint < 1 )); then step_hint=1; fi
+    elif (( rc == 1 )); then
+      high=$point
+      break
+    else
+      break
+    fi
+  done
+  if [[ -n "$high" ]]; then
+    while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do
+      mid=$(( (low + high) / 2 ))
+      probes=$(( probes + 1 ))
+      run_single_serving_probe \
+        "$test_name" "$qps" "$mid" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
+      rc=$?
+      if (( rc == 0 )); then
+        low=$mid
+      elif (( rc == 1 )); then
+        high=$mid
+      else
+        break
+      fi
+    done
+  fi
+  write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high"
+}
 run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
@@ -347,10 +652,48 @@ run_serving_tests() {
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
-    server_args=$(json2args "$server_params")
+    # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly.
+    server_model=$(echo "$server_params" | jq -r '.model // empty')
+    if [[ -z "$server_model" || "$server_model" == "null" ]]; then
+      echo "Error: serving test '$test_name' is missing server_parameters.model" >&2
+      exit 1
+    fi
+    server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)')
+    server_args=$(json2args "$server_params_no_model")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
+    # ------------------------------------------------------------
+    # Option 1: Dynamic num-prompts scaling based on max_concurrency
+    #
+    # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with:
+    #   num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY
+    #
+    # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior
+    # unchanged (i.e., whatever is in serving-tests-*.json).
+    # ------------------------------------------------------------
+    PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}"  # no default on purpose
+    MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}"
+    MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}"
+    if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+      # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates)
+      # Handles: --num-prompts 123   and   --num-prompts=123
+      client_args_no_np="$(
+        printf ' %s ' "$client_args" \
+        | sed -E \
+          -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \
+          -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g'
+      )"
+      # normalize whitespace
+      client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')"
+      client_args_no_np="$(echo "$client_args_no_np" | xargs)"
+      client_args_effective="$client_args_no_np"
+    else
+      client_args_effective="$client_args"
+    fi
    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -382,14 +725,13 @@ run_serving_tests() {
    fi
    # check if server model and client model is aligned
-    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi
-    server_command="$server_envs vllm serve \
+    server_command="$server_envs vllm serve $server_model \
      $server_args"
    # run the server
@@ -436,6 +778,14 @@ run_serving_tests() {
      for max_concurrency in $max_concurrency_list; do
        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"
+        # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts.
+        num_prompts_arg=""
+        if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then
+          num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY ))
+          if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi
+          if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi
+          num_prompts_arg="--num-prompts $num_prompts"
+        fi
        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
@@ -444,8 +794,9 @@ run_serving_tests() {
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
+          $num_prompts_arg \
          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
-          $client_args $client_remote_args "
+          $client_args_effective $client_remote_args "
        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"
@@ -467,6 +818,11 @@ run_serving_tests() {
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
      done
+      adaptive_refine_from_static_results \
+        "$test_name" "$qps" "$max_concurrency_list" "$tp" \
+        "$compilation_config_mode" "$optimization_level" \
+        "$client_args_effective" "$client_remote_args" "$server_command"
    done
    # clean up
@@ -532,6 +888,7 @@ main() {
  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
+  python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json
  upload_to_buildkite
 }

--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
+    },
+    "server_parameters": {
+      "dtype": "bfloat16",
+      "model": "openai/whisper-large-v3-turbo"
+    },
+    "client_parameters": {
+      "model": "openai/whisper-large-v3-turbo",
+      "backend": "openai-audio",
+      "endpoint": "/v1/audio/transcriptions",
+      "dataset_name": "hf",
+      "dataset_path": "openslr/librispeech_asr",
+      "hf_subset": "clean",
+      "hf_split": "test",
+      "no_stream": "",
+      "no_oversample": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {}
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
@@ -149,6 +149,39 @@
        "random-output-len": 128
      }
    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
    {
      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
@@ -188,6 +221,45 @@
        "random-output-len": 128
      }
    },
+    {
+      "test_name": "serving_llama8B_int8_tp1_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp2_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp4_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {

--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -72,17 +72,6 @@
        "random-output-len": 128
      }
    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {
@@ -106,20 +95,20 @@
      }
    },
    {
-      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {
-        "tensor_parallel_size": 4
+        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
-        "random-input-len": 128,
+        "random-input-len": 2048,
-        "random-output-len": 2048
+        "random-output-len": 128
      }
    },
    {
-      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "test_name": "serving_llama8B_tp2_random_2048_128",
      "server_parameters": {
-        "tensor_parallel_size": 1
+        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
@@ -128,25 +117,25 @@
      }
    },
    {
-      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
      "server_parameters": {
-        "tensor_parallel_size": 2
+        "tensor_parallel_size": 1
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
-        "random-output-len": 128
+        "random-output-len": 2048
      }
    },
    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "test_name": "serving_llama8B_tp2_random_2048_2048",
      "server_parameters": {
-        "tensor_parallel_size": 4
+        "tensor_parallel_size": 2
      },
      "client_parameters": {
        "dataset_name": "random",
        "random-input-len": 2048,
-        "random-output-len": 128
+        "random-output-len": 2048
      }
    }
  ]

--- a/docs/benchmarking/dashboard.md
+++ b/docs/benchmarking/dashboard.md
@@ -39,6 +39,12 @@ When run, benchmark script generates results under **benchmark/results** folder,
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
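+- `PROMPTS_PER_CONCURRENCY`: Multiplier used to compute `num_prompts` for serving tests (`num_prompts = max_concurrency × value`, clamped to the range [`MIN_NUM_PROMPTS`, `MAX_NUM_PROMPTS`], which default to 1 and 1000000). When set, it overrides the `num_prompts` value from the JSON test file. Default value is unset (the JSON `num_prompts` is used).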
+- `ENABLE_ADAPTIVE_CONCURRENCY`: set the value to '1' to enable adaptive SLA-based concurrency search after the static serving max_concurrency sweep. Default value is 0.
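+- `SLA_TTFT_MS`: TTFT (time to first token) SLA threshold in milliseconds for adaptive concurrency search. A run passes only if its P99 TTFT (or the mean, when no P99 value is reported) is at or below this value. Default value is 3000.
+- `SLA_TPOT_MS`: TPOT (time per output token) SLA threshold in milliseconds for adaptive concurrency search. A run passes only if its P99 TPOT (falling back to inter-token latency or mean values when no P99 TPOT is reported) is at or below this value. Default value is 100.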
+- `ADAPTIVE_MAX_PROBES`: maximum number of extra adaptive search probes. Default value is 8.
+- `ADAPTIVE_MAX_CONCURRENCY`: maximum allowed concurrency during adaptive search. Default value is 1024.
 ### Visualization

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -70,4 +70,7 @@ kaldi-native-fbank >= 1.18.7 # required for fireredasr2 test
 # Newer versions of datasets require torchcoded, that makes the tests fail in CI because of a missing library.
 # Older versions are in conflict with teerratorch requirements.
 datasets>=3.3.0,<=3.6.0
\ No newline at end of file
+openpyxl # required for perf comparison excel report
+plotly # required for perf comparison html report
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -202,6 +202,8 @@ email-validator==2.2.0
    # via pydantic
 encodec==0.1.1
    # via vocos
+et-xmlfile==2.0.0
+    # via openpyxl
 evaluate==0.4.3
    # via lm-eval
 fastapi==0.128.0
@@ -634,6 +636,8 @@ opencv-python-headless==4.13.0.90
    #   albucore
    #   albumentations
    #   mistral-common
+openpyxl==3.1.5
+    # via -r requirements/test.in
 opentelemetry-api==1.35.0
    # via
    #   opentelemetry-exporter-prometheus
@@ -734,7 +738,9 @@ platformdirs==4.3.6
    #   virtualenv
    #   wandb
 plotly==5.24.1
-    # via genai-perf
+    # via
+    #   -r requirements/test.in
+    #   genai-perf
 pluggy==1.5.0
    # via
    #   pytest