Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
-group: Hardware
+group: Hardware - AMD Build 
 steps:
  - label: "AMD: :docker: build image"
+    key: image-build-amd
    depends_on: []
    device: amd_cpu
    no_plugin: true
@@ -9,7 +10,7 @@ steps:
      docker build
      --build-arg max_jobs=16
      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
      -f docker/Dockerfile.rocm

--- a/.buildkite/hardware_tests/cpu.yaml
+++ b/.buildkite/hardware_tests/cpu.yaml
@@ -21,6 +21,20 @@ steps:
      pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
      pytest -x -v -s tests/kernels/test_onednn.py"
+- label: CPU-Compatibility Tests
+  depends_on: []
+  soft_fail: true
+  device: intel_cpu
+  no_plugin: true
+  source_file_dependencies:
+  - cmake/cpu_extension.cmake
+  - setup.py
+  - vllm/platforms/cpu.py
+  commands:
+    - |
+      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
+      bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
 - label: CPU-Language Generation and Pooling Model Tests
  depends_on: []
  soft_fail: true

--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
@@ -8,7 +8,7 @@ clean_docker_tag() {
 }
 print_usage_and_exit() {
-    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+    echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag> [<image_tag_latest>]"
    exit 1
 }
@@ -142,11 +142,16 @@ resolve_parent_commit() {
 print_bake_config() {
    echo "--- :page_facing_up: Resolved bake configuration"
-    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+    # Write to a temp directory to avoid polluting the repo root (which is the
+    # Docker build context). Files left in the repo root get COPY'd into the
+    # image and can cause duplicate artifact uploads from downstream steps.
+    local bake_tmp
+    bake_tmp="$(mktemp -d)"
+    BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
    docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
    echo "--- :arrow_down: Uploading bake config to Buildkite"
-    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
+    (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")")
 }
 #################################
@@ -154,7 +159,7 @@ print_bake_config() {
 #################################
 print_instance_info
-if [[ $# -lt 7 ]]; then
+if [[ $# -lt 5 ]]; then
    print_usage_and_exit
 fi
@@ -163,10 +168,8 @@ REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
-VLLM_USE_PRECOMPILED=$5
+IMAGE_TAG=$5
-VLLM_MERGE_BASE_COMMIT=$6
+IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional
-IMAGE_TAG=$7
-IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
 # build config
 TARGET="test-ci"
@@ -193,8 +196,6 @@ export CACHE_FROM
 export CACHE_FROM_BASE_BRANCH
 export CACHE_FROM_MAIN
 export CACHE_TO
-export VLLM_USE_PRECOMPILED
-export VLLM_MERGE_BASE_COMMIT
 # print args
 echo "--- :mag: Arguments"
@@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}"
 echo "REPO: ${REPO}"
 echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
 echo "BRANCH: ${BRANCH}"
-echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
-echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
 echo "IMAGE_TAG: ${IMAGE_TAG}"
 echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"

--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -5,8 +5,7 @@ steps:
    depends_on: []
    timeout_in_minutes: 600
    commands:
-    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi
-    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
    retry:
      automatic:
        - exit_status: -1  # Agent was lost

--- a/.buildkite/image_build/image_build_cpu.sh
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,13 +24,11 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_X86=true \
-  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
-  --build-arg VLLM_CPU_AMXBF16=true \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
  --target vllm-test \
  --progress plain .
 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu
--- a/.buildkite/image_build/image_build_cpu_arm64.sh
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -24,10 +24,10 @@ fi
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \
  --target vllm-test \
  --progress plain .
 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu
--- a/.buildkite/image_build/image_build_hpu.sh
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -11,10 +11,10 @@ REPO=$2
 BUILDKITE_COMMIT=$3
 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
 # skip build if image already exists
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
  echo "Image not found, proceeding with build..."
 else
  echo "Image found"
@@ -25,10 +25,10 @@ fi
 docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
-  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+  --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git
 # push
-docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
+docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``
@@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
-  --limit $LIMIT
+  --limit "$LIMIT"
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``

--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install "lm-eval[api]>=0.4.9.2"
+#   pip install "lm-eval[api]>=0.4.11"
 usage() {
    echo``
@@ -20,14 +20,11 @@ usage() {
    echo
 }
-while getopts "m:b:l:f:t:" OPT; do
+while getopts "m:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
-    b )
-        BATCH_SIZE="$OPTARG"
-        ;;
    l )
        LIMIT="$OPTARG"
        ;;

--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -13,9 +13,10 @@ import os
 from contextlib import contextmanager
 import lm_eval
-import numpy as np
 import yaml
+from vllm.platforms import current_platform
 DEFAULT_RTOL = 0.08
@@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size):
        "allow_deprecated_quantization=True,"
    )
+    if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]:
+        model_args += "attention_backend=TRITON_ATTN"
    env_vars = eval_config.get("env_vars", None)
    with scoped_env_vars(env_vars):
        results = lm_eval.simple_evaluate(
@@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
                f"ground_truth={ground_truth:.3f} | "
                f"measured={measured_value:.3f} | rtol={rtol}"
            )
-            success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
+            min_acceptable = ground_truth * (1 - rtol)
+            success = success and measured_value >= min_acceptable
    assert success
--- a/.buildkite/performance-benchmarks/README.md
+++ b/.buildkite/performance-benchmarks/README.md
@@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },

--- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py
--- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
--- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json
@@ -51,5 +51,56 @@
            "max-model-len": 256,
            "async-scheduling": ""
        }
+    },
+    {
+        "test_name": "latency_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "dtype": "bfloat16"
+        }
+    },
+    {
+        "test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "max-model-len": 512,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "gpu-memory-utilization": 0.95,
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "latency_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "dtype": "bfloat16",
+            "async-scheduling": ""
+        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120
+    },
+    "server_parameters": {
+      "dtype": "bfloat16",
+      "model": "openai/whisper-large-v3-turbo"
+    },
+    "client_parameters": {
+      "model": "openai/whisper-large-v3-turbo",
+      "backend": "openai-audio",
+      "endpoint": "/v1/audio/transcriptions",
+      "dataset_name": "hf",
+      "dataset_path": "openslr/librispeech_asr",
+      "hf_subset": "clean",
+      "hf_split": "test",
+      "no_stream": "",
+      "no_oversample": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {}
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [
+      32,
+      64,
+      128
+    ],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "dtype": "bfloat16",
+      "model": "jinaai/jina-embeddings-v3",
+      "trust_remote_code": ""
+    },
+    "client_parameters": {
+      "model": "jinaai/jina-embeddings-v3",
+      "backend": "openai-embeddings",
+      "endpoint": "/v1/embeddings",
+      "dataset_name": "sharegpt",
+      "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_jina_embed_v3_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {}
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json
+{
+  "defaults": {
+    "qps_list": [
+      "inf"
+    ],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "ignore-eos": "",
+      "num_prompts": 200
+    }
+  },
+  "tests": [
+    {
+      "test_name": "serving_llama8B_tp1_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_sharegpt",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "sharegpt",
+        "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_128_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp2_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_tp4_random_2048_2048",
+      "server_parameters": {
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "dataset_name": "random",
+        "random-input-len": 2048,
+        "random-output-len": 2048
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp1_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int4_tp4_random_128_128",
+      "server_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp1_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp2_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 2
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama8B_int8_tp4_random_128_128",
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 4
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_llama3B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "meta-llama/Llama-3.2-3B-Instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_granite2B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "ibm-granite/granite-3.2-2b-instruct",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen1.7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-1.7B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen4B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-4B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_qwen8B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "Qwen/Qwen3-8B",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_glm9B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "zai-org/glm-4-9b-hf",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    },
+    {
+      "test_name": "serving_gemma7B_tp1_random_128_128",
+      "server_parameters": {
+        "model": "google/gemma-7b",
+        "tensor_parallel_size": 1
+      },
+      "client_parameters": {
+        "model": "google/gemma-7b",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128
+      }
+    }
+  ]
+}
--- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json
@@ -72,17 +72,6 @@
        "random-output-len": 128
      }
    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_128",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {
@@ -105,17 +94,6 @@
        "random-output-len": 2048
      }
    },
-    {
-      "test_name": "serving_llama8B_tp4_random_128_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 2048
-      }
-    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {
@@ -139,144 +117,25 @@
      }
    },
    {
-      "test_name": "serving_llama8B_tp4_random_2048_128",
+      "test_name": "serving_llama8B_tp1_random_2048_2048",
-      "server_parameters": {
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "dataset_name": "random",
-        "random-input-len": 2048,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 1
      },
      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "dataset_name": "random",
-        "random-input-len": 128,
+        "random-input-len": 2048,
-        "random-output-len": 128
+        "random-output-len": 2048
      }
    },
    {
-      "test_name": "serving_llama8B_int4_tp2_random_128_128",
+      "test_name": "serving_llama8B_tp2_random_2048_2048",
      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
        "tensor_parallel_size": 2
      },
      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama8B_int4_tp4_random_128_128",
-      "server_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "tensor_parallel_size": 4
-      },
-      "client_parameters": {
-        "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_llama3B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "meta-llama/Llama-3.2-3B-Instruct",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_granite2B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "ibm-granite/granite-3.2-2b-instruct",
        "dataset_name": "random",
-        "random-input-len": 128,
+        "random-input-len": 2048,
-        "random-output-len": 128
+        "random-output-len": 2048
-      }
-    },
-    {
-      "test_name": "serving_qwen1.7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-1.7B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen4B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-4B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_qwen8B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "Qwen/Qwen3-8B",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_glm9B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "zai-org/glm-4-9b-hf",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
-      }
-    },
-    {
-      "test_name": "serving_gemma7B_tp1_random_128_128",
-      "server_parameters": {
-        "model": "google/gemma-7b",
-        "tensor_parallel_size": 1
-      },
-      "client_parameters": {
-        "model": "google/gemma-7b",
-        "dataset_name": "random",
-        "random-input-len": 128,
-        "random-output-len": 128
      }
    }
  ]