[CI]add genai-perf benchmark in nightly benchmark (#10704)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>

[CI]add genai-perf benchmark in nightly benchmark (#10704)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
fead53ba · Kunshang Ji · GitHub · ebc73f28 · fead53ba · fead53ba
Unverified Commit fead53ba authored Jan 17, 2025 by Kunshang Ji Committed by GitHub Jan 17, 2025
4 changed files
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,104 @@ run_serving_tests() {
  kill_gpu_processes
 }

+run_genai_perf_tests() {
+  # Run the genai-perf test cases listed in a JSON file against the
+  # currently selected serving engine, once per configured request rate.
+  # $1: a json file specifying genai-perf test cases
+  # Globals read: CURRENT_LLM_SERVING_ENGINE, TEST_SELECTOR, gpu_count,
+  #               VLLM_SOURCE_CODE_LOC
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests. The loop is the last stage of a pipeline,
+  # so it runs in a subshell and its variables do not outlive it.
+  jq -c '.[]' "$genai_perf_test_file" | while IFS= read -r params; do
+    local test_name common_params model tp port num_prompts reuse_server
+    local server_params qps_list qps client_command
+
+    # get the bare test name; TEST_SELECTOR is matched against this form
+    test_name=$(jq -r '.test_name' <<<"$params")
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(jq -r '.common_parameters' <<<"$params")
+    model=$(jq -r '.model' <<<"$common_params")
+    tp=$(jq -r '.tp' <<<"$common_params")
+    port=$(jq -r '.port' <<<"$common_params")
+    num_prompts=$(jq -r '.num_prompts' <<<"$common_params")
+    reuse_server=$(jq -r '.reuse_server' <<<"$common_params")
+
+    # get the server arguments of the current engine; the key is passed with
+    # --arg so it is looked up as data instead of being spliced into the filter
+    server_params=$(jq -r --arg key "${CURRENT_LLM_SERVING_ENGINE}_server_parameters" \
+      '.[$key]' <<<"$params")
+    # one entry per line; @sh leaves numbers bare and single-quotes strings
+    qps_list=$(jq -r '.qps_list[] | @sh' <<<"$params")
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ "$gpu_count" -lt "$tp" ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ "$reuse_server" == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS (unquoted on purpose: one word per entry)
+    for qps in $qps_list; do
+      # an "inf" entry (emitted by @sh as 'inf') is replaced by num_prompts
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+
+      # TODO: add output dir.
+      # Build the command as an array: arguments stay single words, no eval.
+      # NOTE(review): --backend is fixed to vllm for every serving engine.
+      client_command=(
+        genai-perf profile
+        -m "$model"
+        --service-kind openai
+        --backend vllm
+        --endpoint-type chat
+        --streaming
+        --url "localhost:$port"
+        --request-rate "$qps"
+        --num-prompts "$num_prompts"
+      )
+      echo "Client command: ${client_command[*]}"
+      "${client_command[@]}"
+
+      # TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+}

 prepare_dataset() {

@@ -328,12 +426,17 @@ main() {

  pip install -U transformers

+  pip install -r requirements-dev.txt
+  which genai-perf
+
  # check storage
  df -h

  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d

  prepare_dataset

@@ -345,6 +448,10 @@ main() {
  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"

--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
\ No newline at end of file
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test
 bitsandbytes>=0.45.0
 buildkite-test-collector==0.1.9

+genai_perf==0.0.8
+tritonclient==2.51.0
+
 numpy < 2.0.0
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -37,7 +37,7 @@ audioread==3.0.1
    # via librosa
 awscli==1.35.23
    # via -r requirements-test.in
-bitsandbytes>=0.45.0
+bitsandbytes==0.45.0
    # via -r requirements-test.in
 black==24.10.0
    # via datamodel-code-generator
@@ -75,6 +75,8 @@ colorama==0.4.6
    #   tqdm-multiprocess
 contourpy==1.3.0
    # via matplotlib
+cramjam==2.9.0
+    # via fastparquet
 cupy-cuda12x==13.3.0
    # via ray
 cycler==0.12.1
@@ -109,6 +111,8 @@ email-validator==2.2.0
    # via pydantic
 evaluate==0.4.3
    # via lm-eval
+fastparquet==2024.11.0
+    # via genai-perf
 fastrlock==0.8.2
    # via cupy-cuda12x
 filelock==3.16.1
@@ -130,8 +134,11 @@ fsspec[http]==2024.9.0
    # via
    #   datasets
    #   evaluate
+    #   fastparquet
    #   huggingface-hub
    #   torch
+genai-perf==0.0.8
+    # via -r requirements-test.in
 genson==1.3.0
    # via datamodel-code-generator
 h11==0.14.0
@@ -186,6 +193,8 @@ jsonschema==4.23.0
    #   ray
 jsonschema-specifications==2024.10.1
    # via jsonschema
+kaleido==0.2.1
+    # via genai-perf
 kiwisolver==1.4.7
    # via matplotlib
 lazy-loader==0.4
@@ -200,6 +209,8 @@ lm-eval[api]==0.4.4
    # via -r requirements-test.in
 lxml==5.3.0
    # via sacrebleu
+markdown-it-py==3.0.0
+    # via rich
 markupsafe==3.0.2
    # via jinja2
 matplotlib==3.9.2
@@ -209,6 +220,8 @@ mbstrdecoder==1.1.3
    #   dataproperty
    #   pytablewriter
    #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
 mistral-common[opencv]==1.5.1
    # via
    #   -r requirements-test.in
@@ -249,6 +262,8 @@ numpy==1.26.4
    #   datasets
    #   decord
    #   evaluate
+    #   fastparquet
+    #   genai-perf
    #   librosa
    #   matplotlib
    #   mistral-common
@@ -256,15 +271,18 @@ numpy==1.26.4
    #   numexpr
    #   opencv-python-headless
    #   pandas
+    #   patsy
    #   peft
    #   rouge-score
    #   sacrebleu
    #   scikit-learn
    #   scipy
    #   soxr
+    #   statsmodels
    #   tensorizer
    #   torchvision
    #   transformers
+    #   tritonclient
 nvidia-cublas-cu12==12.4.5.8
    # via
    #   nvidia-cudnn-cu12
@@ -306,30 +324,39 @@ packaging==24.1
    #   datamodel-code-generator
    #   datasets
    #   evaluate
+    #   fastparquet
    #   huggingface-hub
    #   lazy-loader
    #   matplotlib
    #   peft
+    #   plotly
    #   pooch
    #   pytest
    #   pytest-rerunfailures
    #   ray
+    #   statsmodels
    #   transformers
    #   typepy
 pandas==2.2.3
    # via
    #   datasets
    #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   statsmodels
 pathspec==0.12.1
    # via black
 pathvalidate==3.2.1
    # via pytablewriter
+patsy==1.0.1
+    # via statsmodels
 peft==0.13.2
    # via
    #   -r requirements-test.in
    #   lm-eval
 pillow==10.4.0
    # via
+    #   genai-perf
    #   matplotlib
    #   mistral-common
    #   sentence-transformers
@@ -338,6 +365,8 @@ platformdirs==4.3.6
    # via
    #   black
    #   pooch
+plotly==5.24.1
+    # via genai-perf
 pluggy==1.5.0
    # via pytest
 pooch==1.8.2
@@ -360,7 +389,9 @@ psutil==6.1.0
 py==1.11.0
    # via pytest-forked
 pyarrow==18.0.0
-    # via datasets
+    # via
+    #   datasets
+    #   genai-perf
 pyasn1==0.6.1
    # via rsa
 pybind11==2.13.6
@@ -373,6 +404,8 @@ pydantic[email]==2.9.2
    #   mistral-common
 pydantic-core==2.23.4
    # via pydantic
+pygments==2.18.0
+    # via rich
 pyparsing==3.2.0
    # via matplotlib
 pytablewriter==1.2.0
@@ -381,14 +414,18 @@ pytest==8.3.3
    # via
    #   -r requirements-test.in
    #   buildkite-test-collector
+    #   genai-perf
    #   pytest-asyncio
    #   pytest-forked
+    #   pytest-mock
    #   pytest-rerunfailures
    #   pytest-shard
 pytest-asyncio==0.24.0
    # via -r requirements-test.in
 pytest-forked==1.6.0
    # via -r requirements-test.in
+pytest-mock==3.14.0
+    # via genai-perf
 pytest-rerunfailures==14.0
    # via -r requirements-test.in
 pytest-shard==0.1.2
@@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0
    #   matplotlib
    #   pandas
    #   typepy
+python-rapidjson==1.20
+    # via tritonclient
 pytz==2024.2
    # via
    #   pandas
@@ -409,9 +448,11 @@ pyyaml==6.0.2
    #   awscli
    #   datamodel-code-generator
    #   datasets
+    #   genai-perf
    #   huggingface-hub
    #   peft
    #   ray
+    #   responses
    #   timm
    #   transformers
 ray[adag]==2.40.0
@@ -438,8 +479,13 @@ requests==2.32.3
    #   mistral-common
    #   pooch
    #   ray
+    #   responses
    #   tiktoken
    #   transformers
+responses==0.25.3
+    # via genai-perf
+rich==13.9.4
+    # via genai-perf
 rouge-score==0.1.2
    # via lm-eval
 rpds-py==0.20.1
@@ -470,6 +516,7 @@ scipy==1.13.1
    #   librosa
    #   scikit-learn
    #   sentence-transformers
+    #   statsmodels
 sentence-transformers==3.2.1
    # via -r requirements-test.in
 sentencepiece==0.2.0
@@ -490,6 +537,8 @@ soxr==0.5.0.post1
    # via librosa
 sqlitedict==2.1.0
    # via lm-eval
+statsmodels==0.14.4
+    # via genai-perf
 sympy==1.13.1
    # via torch
 tabledata==1.3.3
@@ -499,7 +548,9 @@ tabulate==0.9.0
 tcolorpy==0.1.6
    # via pytablewriter
 tenacity==9.0.0
-    # via lm-eval
+    # via
+    #   lm-eval
+    #   plotly
 tensorizer==2.9.0
    # via -r requirements-test.in
 threadpoolctl==3.5.0
@@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11
    # via lm-eval
 transformers==4.47.0
    # via
+    #   genai-perf
    #   lm-eval
    #   peft
    #   sentence-transformers
@@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5
    # via -r requirements-test.in
 triton==3.1.0
    # via torch
+tritonclient==2.51.0
+    # via
+    #   -r requirements-test.in
+    #   genai-perf
 typepy[datetime]==1.3.2
    # via
    #   dataproperty
@@ -555,6 +611,7 @@ typepy[datetime]==1.3.2
    #   tabledata
 typing-extensions==4.12.2
    # via
+    #   bitsandbytes
    #   huggingface-hub
    #   librosa
    #   mistral-common
@@ -563,10 +620,12 @@ typing-extensions==4.12.2
    #   torch
 tzdata==2024.2
    # via pandas
-urllib3==1.26.20
+urllib3==2.2.3
    # via
    #   botocore
    #   requests
+    #   responses
+    #   tritonclient
 word2number==1.1
    # via lm-eval
 xxhash==3.5.0