Merge tag 'v0.9.2' into v0.9.2-ori

99324e25 · zhuwenwen · cc7f22a8 · a5dd03c1 · 99324e25 · 99324e25
Commit 99324e25 authored Jul 12, 2025 by zhuwenwen
20 changed files
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -16,7 +16,11 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-CMakeLists.txt @tlrmchlsmth
+CMakeLists.txt @tlrmchlsmth @LucasWilkinson
+
+# Any change to VllmConfig can have a large user-facing impact,
+# so spam a lot of people
+/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -27,6 +27,22 @@ pull_request_rules:
      add:
        - ci/build

+- name: label-deepseek
+  description: Automatically apply deepseek label
+  conditions:
+    - or:
+      - files~=^examples/.*deepseek.*\.py
+      - files~=^tests/.*deepseek.*\.py
+      - files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
+      - files~=^vllm/model_executor/models/.*deepseek.*\.py
+      - files~=^vllm/reasoning/.*deepseek.*\.py
+      - files~=^vllm/transformers_utils/.*deepseek.*\.py
+      - title~=(?i)DeepSeek
+  actions:
+    label:
+      add:
+        - deepseek
+
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
@@ -45,6 +61,7 @@ pull_request_rules:
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
      - files~=^vllm/model_executor/models/.*llama.*\.py
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+      - title~=(?i)llama
  actions:
    label:
      add:
@@ -57,14 +74,72 @@ pull_request_rules:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
-      - files~=^tests/models/*/audio_language/
-      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality

+- name: label-new-model
+  description: Automatically apply new-model label
+  conditions:
+    - and:
+      - files~=^vllm/model_executor/models/
+      - files=vllm/model_executor/models/registry.py
+      - files=tests/models/registry.py
+      - files=docs/models/supported_models.md
+  actions:
+    label:
+      add:
+        - new-model
+
+- name: label-performance
+  description: Automatically apply performance label
+  conditions:
+    - or:
+      - files~=^benchmarks/
+      - files~=^vllm/benchmarks/
+      - files~=^tests/benchmarks/
+      - files~=^\.buildkite/nightly-benchmarks/
+  actions:
+    label:
+      add:
+        - performance
+
+- name: label-qwen
+  description: Automatically apply qwen label
+  conditions:
+    - or:
+      - files~=^examples/.*qwen.*\.py
+      - files~=^tests/.*qwen.*\.py
+      - files~=^vllm/model_executor/models/.*qwen.*\.py
+      - files~=^vllm/reasoning/.*qwen.*\.py
+      - title~=(?i)Qwen
+  actions:
+    label:
+      add:
+        - qwen
+
+- name: label-rocm
+  description: Automatically apply rocm label
+  conditions:
+    - or:
+      - files~=^csrc/rocm/
+      - files~=^docker/Dockerfile.rocm
+      - files~=^requirements/rocm.*\.txt
+      - files~=^vllm/attention/backends/rocm.*\.py
+      - files~=^vllm/attention/ops/rocm.*\.py
+      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^tests/kernels/.*_rocm.*\.py
+      - files=vllm/platforms/rocm.py
+      - title~=(?i)AMD
+      - title~=(?i)ROCm
+  actions:
+    label:
+      add:
+        - rocm
+
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
@@ -92,8 +167,14 @@ pull_request_rules:
  conditions:
    - or:
      - files~=^vllm/spec_decode/
+      - files~=^vllm/v1/spec_decode/
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
      - files~=^tests/spec_decode/
+      - files~=^tests/v1/spec_decode/
+      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
+      - files~=^vllm/model_executor/models/.*eagle.*\.py
+      - files=vllm/model_executor/models/mlp_speculator.py
+      - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
  actions:
    label:
      add:

--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -68,7 +68,7 @@ jobs:
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
-          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      - name: curl test
        run: |

--- a/.gitignore
+++ b/.gitignore
@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/

-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -20,12 +20,10 @@ repos:
    args: [--output-format, github, --fix]
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+  rev: v1.32.0
  hooks:
-  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+  - id: typos
 - repo: https://github.com/PyCQA/isort
  rev: 6.0.1
  hooks:
@@ -55,6 +53,11 @@ repos:
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
+  - id: format-torch-nightly-test
+    name: reformat nightly_torch_test.txt to be in sync with test.in
+    language: python
+    entry: python tools/generate_nightly_torch_test.py
+    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy for local Python installation
    entry: tools/mypy.sh 0 "local"
@@ -117,6 +120,11 @@ repos:
    entry: python tools/check_spdx_header.py
    language: python
    types: [python]
+  - id: check-root-lazy-imports
+    name: Check root lazy imports
+    entry: python tools/check_init_lazy_imports.py
+    language: python
+    types: [python]
  - id: check-filenames
    name: Check for spaces in all filenames
    entry: bash
@@ -145,6 +153,20 @@ repos:
    types: [python]
    pass_filenames: false
    additional_dependencies: [regex]
+  - id: check-pickle-imports
+    name: Prevent new pickle/cloudpickle imports
+    entry: python tools/check_pickle_imports.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
+  - id: validate-config
+    name: Validate configuration has default values and that each field has a docstring
+    entry: python tools/validate_config.py
+    language: python
+    types: [python]
+    pass_filenames: true
+    files: vllm/config.py|tests/test_config.py
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -260,7 +260,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")

  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -421,9 +421,39 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+
+  # The cutlass_scaled_mm kernels for GeForce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
+                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+                     "later if you intend on running FP8 quantized models on "
+                     "Blackwell.")
+    else()
+      message(STATUS "Not building scaled_mm_c3x_120 as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
+
+
+  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
+  # require CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@@ -514,6 +544,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -543,13 +574,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  # CUTLASS MoE kernels

-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
  # if it's possible to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
-             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -563,6 +593,46 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "if you intend on running FP8 quantized MoE models on Hopper.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures.")
+    endif()
+  endif()
+
+  # moe_data.cu is used by all CUTLASS MoE kernels.
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
+      message(STATUS "Not building moe_data as CUDA Compiler version is "
+                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
+                     "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
+    else()
+      message(STATUS "Not building moe_data as no compatible archs found "
+                     "in CUDA target architectures.")
+    endif()
+  endif()
+  
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
+    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
+                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
+                     "if you intend on running FP8 quantized MoE models on Blackwell.")
+    else()
+      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()
@@ -639,6 +709,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # if CUDA endif
 endif()

+if (VLLM_GPU_LANG STREQUAL "HIP")
+  # Add QuickReduce kernels
+  list(APPEND VLLM_EXT_SRC
+    "csrc/custom_quickreduce.cu"
+  )
+# if ROCM endif
+endif()
+
 message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
  _C

--- a/README.md
+++ b/README.md
@@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

+<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+<!-- --8<-- [end:contact-us] -->

 ## Media Kit


--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
 datasets supported on vLLM. It’s a living document, updated as new features and datasets
 become available.

-## Dataset Overview
+**Dataset Overview**

 <table style="width:100%; border-collapse: collapse;">
  <thead>
@@ -82,7 +82,10 @@ become available.
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`

 ---
-## Example - Online Benchmark
+<details>
+<summary><b>🚀 Example - Online Benchmark</b></summary>
+
+<br/>

 First start serving your model

@@ -130,7 +133,8 @@ P99 ITL (ms):                            8.39
 ==================================================
 ```

-### Custom Dataset
+**Custom Dataset**
+
 If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl

 ```
@@ -162,7 +166,7 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile

 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.

-### VisionArena Benchmark for Vision Language Models
+**VisionArena Benchmark for Vision Language Models**

 ```bash
 # need a model with vision capability here
@@ -180,7 +184,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts 1000
 ```

-### InstructCoder Benchmark with Speculative Decoding
+**InstructCoder Benchmark with Speculative Decoding**

 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@@ -197,7 +201,7 @@ python3 benchmarks/benchmark_serving.py \
    --num-prompts 2048
 ```

-### Other HuggingFaceDataset Examples
+**Other HuggingFaceDataset Examples**

 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
@@ -251,7 +255,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
    --num-prompts 80
 ```

-### Running With Sampling Parameters
+**Running With Sampling Parameters**

 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
@@ -269,8 +273,27 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts 10
 ```

---
-## Example - Offline Throughput Benchmark
+**Running With Ramp-Up Request Rate**
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
+</details>
+
+<details>
+<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
+
+<br/>

 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
@@ -288,7 +311,7 @@ Total num prompt tokens:  5014
 Total num output tokens:  1500
 ```

-### VisionArena Benchmark for Vision Language Models
+**VisionArena Benchmark for Vision Language Models**

 ``` bash
 python3 vllm/benchmarks/benchmark_throughput.py \
@@ -308,7 +331,7 @@ Total num prompt tokens:  14527
 Total num output tokens:  1280
 ```

-### InstructCoder Benchmark with Speculative Decoding
+**InstructCoder Benchmark with Speculative Decoding**

 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
@@ -332,7 +355,7 @@ Total num prompt tokens:  261136
 Total num output tokens:  204800
 ```

-### Other HuggingFaceDataset Examples
+**Other HuggingFaceDataset Examples**

 **`lmms-lab/LLaVA-OneVision-Data`**

@@ -371,7 +394,7 @@ python3 benchmarks/benchmark_throughput.py \
  --num-prompts 10
 ```

-### Benchmark with LoRA Adapters
+**Benchmark with LoRA Adapters**

 ``` bash
 # download dataset
@@ -387,3 +410,196 @@ python3 vllm/benchmarks/benchmark_throughput.py \
  --enable-lora \
  --lora-path yard1/llama-2-7b-sql-lora-test
  ```
+
+</details>
+
+<details>
+<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
+
+<br/>
+
+Benchmark the performance of structured output generation (JSON, grammar, regex).
+
+**Server Setup**
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+```
+
+**JSON Schema Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset json \
+  --structured-output-ratio 1.0 \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+**Grammar-based Generation Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset grammar \
+  --structure-type grammar \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+**Regex-based Generation Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset regex \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+**Choice-based Generation Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset choice \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+**XGrammar Benchmark Dataset**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset xgrammar_bench \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+</details>
+
+<details>
+<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
+
+<br/>
+
+Benchmark the performance of long document question-answering with prefix caching.
+
+**Basic Long Document QA Test**
+
+```bash
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 16 \
+  --document-length 2000 \
+  --output-len 50 \
+  --repeat-count 5
+```
+
+**Different Repeat Modes**
+
+```bash
+# Random mode (default) - shuffle prompts randomly
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode random
+
+# Tile mode - repeat entire prompt list in sequence
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode tile
+
+# Interleave mode - repeat each prompt consecutively
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode interleave
+```
+
+</details>
+
+<details>
+<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
+
+<br/>
+
+Benchmark the efficiency of automatic prefix caching.
+
+**Fixed Prompt with Prefix Caching**
+
+```bash
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-prompts 1 \
+  --repeat-count 100 \
+  --input-length-range 128:256
+```
+
+**ShareGPT Dataset with Prefix Caching**
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --enable-prefix-caching \
+  --num-prompts 20 \
+  --repeat-count 5 \
+  --input-length-range 128:256
+```
+
+</details>
+
+<details>
+<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
+
+<br/>
+
+Benchmark the performance of request prioritization in vLLM.
+
+**Basic Prioritization Test**
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority
+```
+
+**Multiple Sequences per Prompt**
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority \
+  --n 2
+```
+
+</details>
--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
@@ -10,6 +10,7 @@
 # 3. Set variables (ALL REQUIRED)
 #   BASE: your directory for vllm repo
 #   MODEL: the model served by vllm
+#   SYSTEM: the hardware type, either TPU or GPU; on other systems, "get best profile" might not be supported.
 #   TP: ways of tensor parallelism
 #   DOWNLOAD_DIR: directory to download and load model weights.
 #   INPUT_LEN: request input len
@@ -34,6 +35,7 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
+SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
@@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
+PROFILE_PATH="$LOG_FOLDER/profile"

 echo "result file: $RESULT"
 echo "model: $MODEL"

 rm -rf $LOG_FOLDER
+rm -rf $PROFILE_PATH
 mkdir -p $LOG_FOLDER
+mkdir -p $PROFILE_PATH

 cd "$BASE/vllm"

@@ -70,10 +75,11 @@ start_server() {
    local max_num_seqs=$2
    local max_num_batched_tokens=$3
    local vllm_log=$4
+    local profile_dir=$5
    
    pkill -f vllm

-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization $gpu_memory_utilization \
@@ -105,19 +111,37 @@ start_server() {
    fi
 }

+update_best_profile() {
+    local profile_dir=$1
+    local profile_index=$2
+    sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
+    selected_profile_file=
+    if [[ "$SYSTEM" == "TPU" ]]; then
+        selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
+    fi 
+    if [[ "$SYSTEM" == "GPU" ]]; then
+        selected_profile_file="${sorted_paths[$profile_index]}"
+    fi 
+    rm -f $PROFILE_PATH/*
+    cp $selected_profile_file $PROFILE_PATH
+}
+
 run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
+    local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
+    mkdir -p $profile_dir
    pkill -f vllm
+    local profile_index=0

    echo "starting server..."
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
+    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -144,7 +168,8 @@ run_benchmark() {
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
-        --port 8004 &> "$bm_log"
+        --port 8004 \
+        --profile &> "$bm_log"
    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@@ -158,6 +183,7 @@ run_benchmark() {
    # start from request-rate as int(throughput) + 1
        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
+            profile_index=$((profile_index+1))
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
@@ -195,6 +221,12 @@ run_benchmark() {
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
+            if [[ "$SYSTEM" == "TPU" ]]; then
+                update_best_profile "$profile_dir/plugins/profile" $profile_index
+            fi
+            if [[ "$SYSTEM" == "GPU" ]]; then
+                update_best_profile "$profile_dir" $profile_index
+            fi
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
    done
 done
 echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -404,8 +404,14 @@ async def async_request_openai_chat_completions(
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
+                        # NOTE: SSE comments (often used as pings) start with a colon.
+                        # These are not JSON data payloads and should be skipped.
+                        if chunk_bytes.startswith(":"):
+                            continue
+
+                        chunk = chunk_bytes.removeprefix("data: ")

-                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -349,11 +349,12 @@ class RandomDataset(BenchmarkDataset):
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
            # the encoded sequence is truncated before being decode again.
+            total_input_len = prefix_len + int(input_lens[i])
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-                : input_lens[i]
+                :total_input_len
            ]
            prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = prefix_len + int(input_lens[i])
+            total_input_len = len(re_encoded_sequence)
            requests.append(
                SampleRequest(
                    prompt=prompt,

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -123,7 +123,7 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, results)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion."
@@ -171,6 +171,12 @@ if __name__ == "__main__":
    # V1 enables prefix caching by default which skews the latency
    # numbers. We need to disable prefix caching by default.
    parser.set_defaults(enable_prefix_caching=False)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
        raise OSError(

--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -142,7 +142,7 @@ def main(args):
    )


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the performance with or "
        "without automatic prefix caching."
@@ -192,5 +192,11 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -218,7 +218,7 @@ def main(args):
    )


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the performance with or without "
        "automatic prefix caching."
@@ -268,5 +268,11 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -161,7 +161,7 @@ def main(args: argparse.Namespace):
            json.dump(results, f, indent=4)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@@ -204,6 +204,12 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -33,7 +33,7 @@ import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional
+from typing import Any, Literal, Optional

 import numpy as np
 from tqdm.asyncio import tqdm
@@ -107,14 +107,42 @@ class BenchmarkMetrics:
    percentiles_e2el_ms: list[tuple[float, float]]


+def _get_current_request_rate(  # Rate (RPS) to use for the request at position request_index.
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
+    ramp_up_start_rps: Optional[int],
+    ramp_up_end_rps: Optional[int],
+    request_index: int,
+    total_requests: int,
+    request_rate: float,  # Returned unchanged unless strategy, start and end are all set.
+) -> float:
+    if (
+        ramp_up_strategy
+        and ramp_up_start_rps is not None
+        and ramp_up_end_rps is not None
+    ):
+        progress = request_index / max(total_requests - 1, 1)  # 0 at index 0; max() keeps the divisor >= 1.
+        if ramp_up_strategy == "linear":
+            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
+            return ramp_up_start_rps + increase
+        elif ramp_up_strategy == "exponential":
+            ratio = ramp_up_end_rps / ramp_up_start_rps  # Raises ZeroDivisionError if ramp_up_start_rps is 0.
+            return ramp_up_start_rps * (ratio**progress)  # Equals start at progress 0 and end at progress 1.
+        else:
+            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
+    return request_rate
+
+
 async def get_request(
    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
-) -> AsyncGenerator[SampleRequest, None]:
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+) -> AsyncGenerator[tuple[SampleRequest, float], None]:
    """
    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness.
+    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.

    Args:
        input_requests:
@@ -129,22 +157,44 @@ async def get_request(
            A lower burstiness value (0 < burstiness < 1) results
            in more bursty requests, while a higher burstiness value
            (burstiness > 1) results in a more uniform arrival of requests.
+        ramp_up_strategy (optional):
+            The ramp-up strategy. Can be "linear" or "exponential".
+            If None, uses constant request rate (specified by request_rate).
+        ramp_up_start_rps (optional):
+            The starting request rate for ramp-up.
+        ramp_up_end_rps (optional):
+            The ending request rate for ramp-up.
    """
-    input_requests: Iterable[SampleRequest] = iter(input_requests)
-
-    # Calculate scale parameter theta to maintain the desired request_rate.
    assert burstiness > 0, (
        f"A positive burstiness factor is expected, but given {burstiness}."
    )
-    theta = 1.0 / (request_rate * burstiness)
+    # Convert to list to get length for ramp-up calculations
+    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
+        input_requests = list(input_requests)
+
+    total_requests = len(input_requests)
+    request_index = 0

    for request in input_requests:
-        yield request
+        current_request_rate = _get_current_request_rate(
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            request_index,
+            total_requests,
+            request_rate,
+        )
+
+        yield request, current_request_rate

-        if request_rate == float("inf"):
+        request_index += 1
+
+        if current_request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue

+        theta = 1.0 / (current_request_rate * burstiness)
+
        # Sample the request interval from the gamma distribution.
        # If burstiness is 1, it follows exponential distribution.
        interval = np.random.gamma(shape=burstiness, scale=theta)
@@ -290,6 +340,9 @@ async def benchmark(
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
    extra_body: Optional[dict],
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -353,7 +406,15 @@ async def benchmark(

    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"

-    print(f"Traffic request rate: {request_rate}")
+    if ramp_up_strategy is not None:
+        print(
+            f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
+            f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
+            "the duration of the benchmark."
+        )
+    else:
+        print(f"Traffic request rate: {request_rate} RPS.")
+
    print(f"Burstiness factor: {burstiness} ({distribution})")
    print(f"Maximum request concurrency: {max_concurrency}")

@@ -373,7 +434,34 @@ async def benchmark(

    benchmark_start_time = time.perf_counter()
    tasks: list[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate, burstiness):
+
+    rps_change_events = []
+    last_int_rps = -1
+    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
+        last_int_rps = ramp_up_start_rps
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
+
+    async for request, current_request_rate in get_request(
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
+        if ramp_up_strategy is not None:
+            current_int_rps = int(current_request_rate)
+            if current_int_rps > last_int_rps:
+                timestamp = datetime.now().isoformat()
+                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
+                last_int_rps = current_int_rps
+
        prompt, prompt_len, output_len, mm_content = (
            request.prompt,
            request.prompt_len,
@@ -397,11 +485,8 @@ async def benchmark(
            ignore_eos=ignore_eos,
            extra_body=extra_body,
        )
-        tasks.append(
-            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-            )
-        )
+        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
+        tasks.append(asyncio.create_task(task))
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

    if profile:
@@ -466,7 +551,7 @@ async def benchmark(
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
-        "request_goodput:": metrics.request_goodput if goodput_config_dict else None,
+        "request_goodput": metrics.request_goodput if goodput_config_dict else None,
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
@@ -477,6 +562,9 @@ async def benchmark(
        "errors": [output.error for output in outputs],
    }

+    if rps_change_events:
+        result["rps_change_events"] = rps_change_events
+
    def process_one_metric(
        # E.g., "ttft"
        metric_attribute_name: str,
@@ -610,6 +698,26 @@ def main(args: argparse.Namespace):
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
    tokenizer_mode = args.tokenizer_mode

+    # Validate ramp-up arguments
+    if args.ramp_up_strategy is not None:
+        if args.request_rate != float("inf"):
+            raise ValueError(
+                "When using ramp-up, do not specify --request-rate. "
+                "The request rate will be controlled by ramp-up parameters. "
+                "Please remove the --request-rate argument."
+            )
+        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
+            raise ValueError(
+                "When using --ramp-up-strategy, both --ramp-up-start-rps and "
+                "--ramp-up-end-rps must be specified"
+            )
+        # A rate of 0 cannot be scheduled: the inter-request interval is derived
+        # from 1 / rate, and the exponential ramp divides by the start rate.
+        if args.ramp_up_start_rps <= 0 or args.ramp_up_end_rps <= 0:
+            raise ValueError("Ramp-up start and end RPS must be positive")
+        if args.ramp_up_start_rps > args.ramp_up_end_rps:
+            raise ValueError("Ramp-up start RPS must not exceed end RPS")
+
    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
        base_url = f"{args.base_url}"
@@ -802,6 +910,9 @@ def main(args: argparse.Namespace):
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
            extra_body=sampling_params,
+            ramp_up_strategy=args.ramp_up_strategy,
+            ramp_up_start_rps=args.ramp_up_start_rps,
+            ramp_up_end_rps=args.ramp_up_end_rps,
        )
    )

@@ -834,6 +945,11 @@ def main(args: argparse.Namespace):
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

+        if args.ramp_up_strategy is not None:
+            result_json["ramp_up_strategy"] = args.ramp_up_strategy
+            result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
+            result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
+
        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

@@ -859,7 +975,10 @@ def main(args: argparse.Namespace):
            if args.max_concurrency is not None
            else ""
        )
-        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        if args.ramp_up_strategy is not None:
+            file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        else:
+            file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
@@ -875,7 +994,7 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, result_json, file_name)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput."
    )
@@ -1225,6 +1344,35 @@ if __name__ == "__main__":
        "script chooses a LoRA module at random.",
    )

-    args = parser.parse_args()
+    parser.add_argument(
+        "--ramp-up-strategy",
+        type=str,
+        default=None,
+        choices=["linear", "exponential"],
+        help="The ramp-up strategy. This would be used to "
+        "ramp up the request rate from initial RPS to final "
+        "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps) "
+        "over the duration of the benchmark.",
+    )
+    parser.add_argument(
+        "--ramp-up-start-rps",
+        type=int,
+        default=None,
+        help="The starting request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+    parser.add_argument(
+        "--ramp-up-end-rps",
+        type=int,
+        default=None,
+        help="The ending request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+
+    return parser

+
+if __name__ == "__main__":
+    parser = create_argument_parser()
+    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
            json.dump(results, outfile, indent=4)


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput."
    )
@@ -1034,5 +1034,10 @@ if __name__ == "__main__":
        help="Ratio of Structured Outputs requests",
    )

+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -97,7 +97,7 @@ def run_vllm(
        assert lora_requests is None, "BeamSearch API does not support LoRA"
        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
-        output_len = requests[0][2]
+        output_len = requests[0].expected_output_len
        for request in requests:
            assert request.expected_output_len == output_len
        start = time.perf_counter()
@@ -595,7 +595,7 @@ def validate_args(args):
        )


-if __name__ == "__main__":
+def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend",
@@ -717,6 +717,12 @@ if __name__ == "__main__":
    )

    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model

--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -19,7 +19,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, cdiv

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -117,14 +117,9 @@ def bench_fp8(
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

-    def ceil_div(x: int, y: int) -> int:
-        return (x + y - 1) // y
-
-    block_scale_a = torch.rand(
-        (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
-    )
+    block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
    block_scale_b = torch.rand(
-        ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
+        cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
    )
    block_scale_a_M_major = block_scale_a.t().contiguous().t()
    block_scale_b_K_major = block_scale_b.t().contiguous().t()

--- a/benchmarks/kernels/bench_fp8_gemm.py
+++ b/benchmarks/kernels/bench_fp8_gemm.py
@@ -11,6 +11,80 @@ from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
 from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
 from vllm.triton_utils import triton

+PROVIDER_CFGS = {  # Provider name -> settings read by build_fp8_runner and the _enabled filter.
+    "torch-bf16": dict(enabled=True),  # No quant keys: benchmark() handles this provider separately.
+    "fp8-tensor-w-token-a": dict(
+        w="tensor", a="token", no_a_quant=False, enabled=False  # w / a: quant mode for weight b / activation a.
+    ),
+    "fp8-tensor-w-tensor-a": dict(
+        w="tensor", a="tensor", no_a_quant=False, enabled=True
+    ),
+    "fp8-channel-w-token-a": dict(
+        w="channel", a="token", no_a_quant=False, enabled=True
+    ),
+    "fp8-channel-w-tensor-a": dict(
+        w="channel", a="tensor", no_a_quant=False, enabled=False
+    ),
+    "fp8-tensor-w-token-a-noquant": dict(
+        w="tensor", a="token", no_a_quant=True, enabled=False  # no_a_quant=True: a is quantized once, outside run().
+    ),
+    "fp8-tensor-w-tensor-a-noquant": dict(
+        w="tensor", a="tensor", no_a_quant=True, enabled=True
+    ),
+    "fp8-channel-w-token-a-noquant": dict(
+        w="channel", a="token", no_a_quant=True, enabled=True
+    ),
+    "fp8-channel-w-tensor-a-noquant": dict(
+        w="channel", a="tensor", no_a_quant=True, enabled=False
+    ),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]  # Used as both line_vals and line_names of the perf report.
+
+
+def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):  # Quantize b; returns (transposed quantized b, scale).
+    if w_type == "tensor":
+        scale_b = torch.ones(1, device=device, dtype=torch.float32)  # Explicit scale of 1.0 handed to the quant op.
+        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
+    else:  # Any other w_type ("channel" in PROVIDER_CFGS): no scale passed; presumably dynamic per-row scales - confirm.
+        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
+    return b_fp8.t(), scale_b_fp8
+
+
+def build_fp8_runner(cfg, a, b, dtype, device):  # Return a zero-argument callable that runs the FP8 GEMM described by cfg.
+    b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)  # b is quantized once here, outside the returned callable.
+
+    scale_a_const = (
+        torch.ones(1, device=device, dtype=torch.float32)
+        if cfg["a"] == "tensor"
+        else None  # Only the "tensor" mode passes an explicit scale for a.
+    )
+
+    if cfg["no_a_quant"]:  # Quantize a up front so run() performs only the scaled matmul.
+        if cfg["a"] == "tensor":
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
+        else:
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+
+        def run():
+            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+        return run
+
+    if cfg["a"] == "tensor":  # Otherwise run() quantizes a on every call before the matmul.
+
+        def run():
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
+            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+    else:
+
+        def run():
+            a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+            return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
+
+    return run
+

 @triton.testing.perf_report(
    triton.testing.Benchmark(
@@ -18,28 +92,8 @@ from vllm.triton_utils import triton
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
-        line_vals=[
-            "torch-bf16",
-            # "fp8-tensor-w-token-a",
-            "fp8-tensor-w-tensor-a",
-            "fp8-channel-w-token-a",
-            # "fp8-channel-w-tensor-a",
-            # "fp8-tensor-w-token-a-noquant",
-            "fp8-tensor-w-tensor-a-noquant",
-            "fp8-channel-w-token-a-noquant",
-            # "fp8-channel-w-tensor-a-noquant",
-        ],
-        line_names=[
-            "torch-bf16",
-            # "fp8-tensor-w-token-a",
-            "fp8-tensor-w-tensor-a",
-            "fp8-channel-w-token-a",
-            # "fp8-channel-w-tensor-a",
-            # "fp8-tensor-w-token-a-noquant",
-            "fp8-tensor-w-tensor-a-noquant",
-            "fp8-channel-w-token-a-noquant",
-            # "fp8-channel-w-tensor-a-noquant",
-        ],
+        line_vals=_enabled,
+        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs FP8 GEMMs",
        args={},
@@ -50,144 +104,34 @@ def benchmark(batch_size, provider, N, K):
    device = "cuda"
    dtype = torch.bfloat16

-    # Create input tensors
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

-    if "torch-bf16" in provider:
+    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
-
-    elif "fp8" in provider:
-        # Weights are always quantized ahead of time
-        if "noquant" in provider:
-            # For no quantization, we just measure the GEMM
-            if "tensor-w-token-a" in provider:
-                # Dynamic per-token quant for A, per-tensor quant for B
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
-                assert scale_b_fp8.numel() == 1
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                    a, use_per_token_if_dynamic=True
-                )
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "tensor-w-tensor-a" in provider:
-                # Static per-tensor quantization with fixed scales
-                # for both A and B
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                assert scale_b_fp8.numel() == 1
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-token-a" in provider:
-                # Static per-channel quantization for weights, per-token
-                # quant for A
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                    a, use_per_token_if_dynamic=True
-                )
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-tensor-a" in provider:
-                # Static per-channel quantization for weights, per-tensor
-                # quant for A
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-                a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-
-                def run_quant():
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-        else:
-            # In these cases, we quantize the activations during the GEMM call
-            if "tensor-w-token-a" in provider:
-                # Dynamic per-token quant for A, per-tensor quant for B
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
-                assert scale_b_fp8.numel() == 1
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                        a, use_per_token_if_dynamic=True
-                    )
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "tensor-w-tensor-a" in provider:
-                # Static per-tensor quantization with fixed scales
-                # for both A and B
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                assert scale_b_fp8.numel() == 1
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-token-a" in provider:
-                # Static per-channel quantization for weights, per-token
-                # quant for A
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
-                        a, use_per_token_if_dynamic=True
-                    )
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-            elif "channel-w-tensor-a" in provider:
-                # Static per-channel quantization for weights, per-tensor
-                # quant for A
-                scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
-                scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
-                b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
-                scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
-                assert scale_b_fp8.numel() == N
-
-                def run_quant():
-                    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
-                    return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
-
-        b_fp8 = b_fp8.t()
-
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_fp8_runner(cfg, a, b, dtype, device)
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), quantiles=quantiles
        )

-    # Calculate TFLOP/s, two flops per multiply-add
-    tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
-    return tflops(ms), tflops(max_ms), tflops(min_ms)
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


 def prepare_shapes(args):
-    KN_model_names = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        assert model in WEIGHT_SHAPES
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
-            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
            KN.append(model)
-            KN_model_names.append(KN)
-    return KN_model_names
+            out.append(KN)
+    return out


 if __name__ == "__main__":
@@ -197,21 +141,13 @@ if __name__ == "__main__":
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
-        choices=[*WEIGHT_SHAPES.keys()],
-        help="List of models to benchmark",
-    )
-    parser.add_argument(
-        "--tp-sizes",
-        nargs="+",
-        type=int,
-        default=[1],
-        help="List of tensor parallel sizes",
+        choices=list(WEIGHT_SHAPES.keys()),
    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

-    KN_model_names = prepare_shapes(args)
-    for K, N, model_name in KN_model_names:
-        print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
+    for K, N, model in prepare_shapes(args):
+        print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
        benchmark.run(
            print_data=True,
            show_plots=True,