Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

8d75f22e · zhuwenwen · ce888aa4 · 7d80c73d · 8d75f22e · 8d75f22e
Commit 8d75f22e authored Dec 13, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
+group: Plugins
+depends_on: 
+  - image-build
+steps:
+- label: Plugin Tests (2 GPUs)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests; all the code in between runs on the dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
+group: PyTorch
+depends_on: 
+  - image-build
+steps:
+- label: PyTorch Compilation Unit Tests
+  timeout_in_minutes: 30
+  source_file_dependencies:
+    - vllm/
+    - tests/compile
+  commands:
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph Smoke Test
+  timeout_in_minutes: 30
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
+
+- label: PyTorch Fullgraph
+  timeout_in_minutes: 40
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+    # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # If this test fails, the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to the
+  # whitelist in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
\ No newline at end of file
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
+group: Quantization
+depends_on: 
+  - image-build
+steps:
+- label: Quantization
+  timeout_in_minutes: 90
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+
+- label: Quantized MoE Test (B200)
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Quantized Models Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
+group: Samplers
+depends_on: 
+  - image-build
+steps:
+- label: Samplers Test
+  timeout_in_minutes: 75
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
--- a/.buildkite/test_areas/tool_use.yaml
+++ b/.buildkite/test_areas/tool_use.yaml
+group: Tool use
+depends_on: 
+  - image-build
+steps:
+- label: OpenAI-Compatible Tool Use
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental]
+  fast_check: false
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU)
+  depends_on: ~
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
--- a/.buildkite/test_areas/weight_loading.yaml
+++ b/.buildkite/test_areas/weight_loading.yaml
+group: Weight Loading
+depends_on: 
+  - image-build
+steps:
+- label: Weight Loading Multiple GPU  # 33min
+  timeout_in_minutes: 45
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -146,10 +146,10 @@ mkdocs.yaml @hmellor
 /requirements/kv_connectors.txt @NickLucche

 # Pooling models
-/examples/*/pooling/ @noooop
+/examples/pooling @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
-/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
+/vllm/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop
 /vllm/pooling_params.py @noooop
 /vllm/model_executor/layers/pooler.py @noooop

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -14,6 +14,52 @@ pull_request_rules:
    comment:
      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

+- name: comment-pre-commit-failure
+  description: Comment on PR when pre-commit check fails
+  conditions:
+    - status-failure=pre-commit
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the pre-commit checks have failed. Please run:
+
+        ```bash 
+        uv pip install pre-commit
+        pre-commit install
+        pre-commit run --all-files
+        ```
+
+        Then, commit the changes and push to your branch.
+
+        For future commits, `pre-commit` will run automatically on changed files before each commit.
+
+        > [!TIP]
+        > <details>
+        > <summary>Is <code>mypy</code> or <code>markdownlint</code> failing?</summary>
+        > <br/>
+        > <code>mypy</code> and <code>markdownlint</code> are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally:
+        >
+        > ```bash
+        > # For mypy (substitute "3.10" with the failing version if needed)
+        > pre-commit run --hook-stage manual mypy-3.10
+        > # For markdownlint
+        > pre-commit run --hook-stage manual markdownlint
+        > ```
+        > </details>
+
+- name: comment-dco-failure
+  description: Comment on PR when DCO check fails
+  conditions:
+    - status-failure=dco
+    - -closed
+    - -draft
+  actions:
+    comment:
+      message: |
+        Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this.
+
 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
@@ -140,7 +186,7 @@ pull_request_rules:
      - files~=^tests/entrypoints/test_context.py
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
-      - files~=^vllm/entrypoints/harmony_utils.py
+      - files~=^vllm/entrypoints/openai/parser/harmony_utils.py
      - files~=^vllm/entrypoints/tool_server.py
      - files~=^vllm/entrypoints/tool.py
      - files~=^vllm/entrypoints/context.py
@@ -358,4 +404,4 @@ pull_request_rules:
  actions:
    label:
      add:
-        - kv-connector
\ No newline at end of file
+        - kv-connector
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,10 +13,10 @@ jobs:

    steps:
      - name: Checkout repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'


--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -12,7 +12,7 @@ jobs:
    timeout-minutes: 30

    steps:
-      - uses: actions/checkout@v6
+      - uses: actions/checkout@v6.0.1

      - uses: astral-sh/setup-uv@v7
        with:

--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,8 +16,8 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+    - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -7,13 +7,15 @@ on:

 jobs:
  close-issues-and-pull-requests:
+    # Prevents triggering on forks or other repos
+    if: github.repository == 'vllm-project/vllm'
    permissions:
      issues: write
      pull-requests: write
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
+      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
-       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu"
+       "csrc/quantization/cutlass_w4a8/w4a8_utils.cu"
+       )

    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
-  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -137,6 +137,7 @@ Compute Resources:
 - Alibaba Cloud
 - AMD
 - Anyscale
+- Arm
 - AWS
 - Crusoe Cloud
 - Databricks

--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -96,8 +96,9 @@ start_server() {
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
-        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
-            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
+        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
+        VLLM_SERVER_DEV_MODE=1 \
+            vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
        VLLM_SERVER_DEV_MODE=1 \

--- a/benchmarks/benchmark_hash.py
+++ b/benchmarks/benchmark_hash.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
+
+This focuses on a single test payload shaped like the prefix-cache hash input:
+    (32-byte bytes object, 32-int tuple)
+
+Usage:
+    python benchmarks/benchmark_hash.py --iterations 20000
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import time
+from collections.abc import Callable, Iterable
+
+from vllm.utils.hashing import sha256, xxhash
+
+
+def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
+    """Generate a deterministic test payload.
+
+    Args:
+        seed: Seed for the pseudo-random generator; equal seeds give equal
+            payloads.
+
+    Returns:
+        A ``(bytes_data, int_tuple)`` pair: 32 random bytes followed by a
+        tuple of 32 integers drawn from ``[1, 1_000_000]``.
+    """
+    # Use a private generator instead of random.seed(): it produces the same
+    # stream for a given seed but leaves the process-wide `random` state alone.
+    rng = random.Random(seed)
+    bytes_data = bytes(rng.getrandbits(8) for _ in range(32))
+    int_tuple = tuple(rng.randint(1, 1_000_000) for _ in range(32))
+    return (bytes_data, int_tuple)
+
+
+def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
+    """Time `func(data)` and return (avg_seconds, std_seconds).
+
+    200 untimed warm-up calls are made first, then `iterations` calls are
+    timed one by one with `time.perf_counter`. The standard deviation is
+    reported as 0.0 when only a single sample was collected.
+    """
+    # Untimed warm-up so first-run noise does not skew the samples.
+    warmup_calls = 200
+    for _ in range(warmup_calls):
+        func(data)
+
+    samples: list[float] = []
+    for _ in range(iterations):
+        begin = time.perf_counter()
+        func(data)
+        samples.append(time.perf_counter() - begin)
+
+    mean_s = statistics.mean(samples)
+    if len(samples) > 1:
+        stdev_s = statistics.stdev(samples)
+    else:
+        stdev_s = 0.0
+    return mean_s, stdev_s
+
+
+def _run_benchmarks(
+    benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
+    data: tuple,
+    iterations: int,
+):
+    """Lazily yield (name, avg, std) per benchmark.
+
+    A benchmark whose timing run raises ModuleNotFoundError is reported on
+    stdout and left out of the results.
+    """
+    for label, hash_fn in benchmarks:
+        try:
+            mean_s, stdev_s = _benchmark_func(hash_fn, data, iterations)
+        except ModuleNotFoundError as exc:
+            print(f"Skipping {label}: {exc}")
+        else:
+            yield label, mean_s, stdev_s
+
+
+def builtin_hash(data: tuple) -> int:
+    """Wrapper for Python's built-in hash().
+
+    Gives the built-in the same one-argument call shape as the other hash
+    functions passed to the benchmark runner.
+    """
+    return hash(data)
+
+
+def main() -> None:
+    """Parse CLI options, time each hash function, and print a report.
+
+    The report lists the mean and standard deviation per function in
+    microseconds, followed by each function's time relative to built-in hash().
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=10_000,
+        help="Number of measured iterations per hash function.",
+    )
+    parser.add_argument(
+        "--seed", type=int, default=42, help="Random seed for test payload."
+    )
+    args = parser.parse_args()
+
+    data = _generate_test_data(args.seed)
+    # (display name, callable) pairs; the "built-in hash()" name is also used
+    # below to locate the baseline entry.
+    benchmarks = (
+        ("SHA256 (pickle)", sha256),
+        ("xxHash (pickle)", xxhash),
+        ("built-in hash()", builtin_hash),
+    )
+
+    print("=" * 60)
+    print("HASH FUNCTION MICRO BENCHMARK")
+    print("=" * 60)
+    print("Test data: (32-byte bytes object, 32-int tuple)")
+    print(f"Iterations: {args.iterations:,}")
+    print("=" * 60)
+
+    # Materialize the generator so the results can be traversed twice.
+    results = list(_run_benchmarks(benchmarks, data, args.iterations))
+    builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
+
+    print("\nResults:")
+    for name, avg, std in results:
+        # Timings are collected in seconds; scale to microseconds for display.
+        print(f"  {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
+
+    if builtin_entry:
+        _, builtin_avg, _ = builtin_entry
+        print("\n" + "=" * 60)
+        print("SUMMARY (relative to built-in hash())")
+        print("=" * 60)
+        for name, avg, _ in results:
+            if name == "built-in hash()":
+                continue
+            # Ratio of mean times; values above 1 mean slower than the baseline.
+            speed_ratio = avg / builtin_avg
+            print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
+    else:
+        print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/benchmark_prefix_block_hash.py
+++ b/benchmarks/benchmark_prefix_block_hash.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Simple benchmark to compare prefix-cache block hashing algorithms.
+
+Example:
+    python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32
+"""
+
+from __future__ import annotations
+
+import argparse
+import random
+import statistics
+import sys
+import time
+from collections.abc import Callable, Iterable, Sequence
+
+from vllm.utils.hashing import get_hash_fn_by_name
+from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash
+
+SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor")
+
+
+def _generate_blocks(
+    num_blocks: int, block_size: int, vocab_size: int, seed: int
+) -> list[list[int]]:
+    """Build `num_blocks` lists of `block_size` pseudo-random token ids.
+
+    Token ids are drawn from ``[0, vocab_size)`` using a generator seeded with
+    `seed`, so the same arguments always produce the same blocks.
+    """
+    rng = random.Random(seed)
+    blocks: list[list[int]] = []
+    for _ in range(num_blocks):
+        blocks.append([rng.randrange(vocab_size) for _ in range(block_size)])
+    return blocks
+
+
+def _hash_all_blocks(
+    hash_fn: Callable[[object], bytes],
+    blocks: Iterable[Sequence[int]],
+) -> float:
+    """Hash every block in order and return the elapsed seconds.
+
+    Each block's hash is chained to the previous one by passing the prior
+    result as the parent hash; the first block uses ``None`` as its parent.
+    """
+    parent_hash: BlockHash | None = None
+    start = time.perf_counter()
+    for block in blocks:
+        parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None)
+    end = time.perf_counter()
+    return end - start
+
+
+def _benchmark(
+    hash_algo: str,
+    blocks: list[list[int]],
+    trials: int,
+) -> tuple[float, float, float] | None:
+    """Time one hash algorithm over all blocks for `trials` runs.
+
+    Args:
+        hash_algo: Name passed to `get_hash_fn_by_name`.
+        blocks: Token-id blocks to hash; may be empty.
+        trials: Number of timed passes over `blocks`.
+
+    Returns:
+        ``(avg_seconds, best_seconds, tokens_per_second)``, where throughput
+        is based on the fastest trial, or ``None`` when the algorithm is
+        skipped because a required module is missing.
+    """
+    try:
+        hash_fn = get_hash_fn_by_name(hash_algo)
+        init_none_hash(hash_fn)
+        timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)]
+    except ModuleNotFoundError as exc:
+        print(f"Skipping {hash_algo}: {exc}", file=sys.stderr)
+        return None
+
+    avg = statistics.mean(timings)
+    best = min(timings)
+    # Sum the block lengths instead of reading blocks[0], which raised
+    # IndexError when no blocks were generated (e.g. --num-blocks 0).
+    tokens_hashed = sum(len(block) for block in blocks)
+    # throughput: tokens / second. Guard against a zero-duration best trial,
+    # which would otherwise raise ZeroDivisionError.
+    throughput = tokens_hashed / best if best > 0 else 0.0
+    return avg, best, throughput
+
+
+def main() -> None:
+    """Parse CLI options, build the random blocks, and print one line per algorithm.
+
+    Algorithms for which `_benchmark` returns ``None`` are omitted from the
+    output.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.")
+    parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.")
+    parser.add_argument(
+        "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)."
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed.")
+    parser.add_argument(
+        "--trials", type=int, default=5, help="Number of timed trials per algorithm."
+    )
+    parser.add_argument(
+        "--algorithms",
+        nargs="+",
+        default=SUPPORTED_ALGOS,
+        choices=SUPPORTED_ALGOS,
+        help="Hash algorithms to benchmark.",
+    )
+    args = parser.parse_args()
+
+    # The same block list is reused for every algorithm so results are comparable.
+    blocks = _generate_blocks(
+        args.num_blocks, args.block_size, args.vocab_size, args.seed
+    )
+    print(
+        f"Benchmarking {len(args.algorithms)} algorithms on "
+        f"{args.num_blocks} blocks (block size={args.block_size})."
+    )
+
+    for algo in args.algorithms:
+        result = _benchmark(algo, blocks, args.trials)
+        if result is None:
+            continue
+
+        avg, best, throughput = result
+        # Throughput is in tokens/second; report it in millions.
+        print(
+            f"{algo:14s} avg: {avg:.6f}s  best: {best:.6f}s  "
+            f"throughput: {throughput / 1e6:.2f}M tokens/s"
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -963,8 +963,7 @@ def create_argument_parser():
    parser.add_argument(
        "--profile",
        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+        help="Use vLLM Profiling. --profiler-config must be provided on the server.",
    )
    parser.add_argument(
        "--result-dir",

--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -14,6 +14,9 @@ from tqdm import tqdm

 import vllm._custom_ops as ops
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+)


 @dataclass
@@ -22,6 +25,7 @@ class bench_params_t:
    hidden_size: int
    add_residual: bool
    dtype: torch.dtype
+    group_size: list[int]

    def description(self):
        return (
@@ -29,6 +33,7 @@ class bench_params_t:
            f"x D {self.hidden_size} "
            f"x R {self.add_residual} "
            f"x DT {self.dtype}"
+            f"x GS {self.group_size}"
        )


@@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]:
    HIDDEN_SIZES = list(range(1024, 8129, 1024))
    ADD_RESIDUAL = [True, False]
    DTYPES = [torch.bfloat16, torch.float]
+    GROUP_SIZES = [[1, 64], [1, 128]]

-    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES)
    bench_params = list(
-        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations)
+        map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations)
    )
    return bench_params

@@ -52,6 +58,7 @@ def unfused_int8_impl(
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
    # Norm
    torch_out = None
@@ -69,6 +76,7 @@ def unfused_fp8_impl(
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
    # Norm
    torch_out = None
@@ -81,23 +89,63 @@ def unfused_fp8_impl(
    torch_out, _ = ops.scaled_fp8_quant(torch_out)


+def unfused_groupwise_fp8_impl(
+    rms_norm_layer: RMSNorm,
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _ = per_token_group_quant_fp8(
+        torch_out, group_size=group_size[1], use_ue8m0=False
+    )
+
+
 def fused_impl(
    rms_norm_layer: RMSNorm,  # this stores the weights
    x: torch.Tensor,
    residual: torch.Tensor | None,
    quant_dtype: torch.dtype,
+    group_size: list[int],
 ):
    out, _ = ops.rms_norm_dynamic_per_token_quant(
        x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual
    )


+def fused_groupwise_impl(
+    rms_norm_layer: RMSNorm,  # this stores the weights
+    x: torch.Tensor,
+    residual: torch.Tensor | None,
+    quant_dtype: torch.dtype,
+    group_size: list[int],
+):
+    out, _ = ops.rms_norm_per_block_quant(
+        x,
+        rms_norm_layer.weight,
+        1e-6,
+        quant_dtype,
+        group_size,
+        residual=residual,
+        is_scale_transposed=True,
+    )
+
+
 # Bench functions
 def bench_fn(
    rms_norm_layer: RMSNorm,
    x: torch.Tensor,
    residual: torch.Tensor,
    quant_dtype: torch.dtype,
+    group_size: list[int],
    label: str,
    sub_label: str,
    fn: Callable,
@@ -110,10 +158,11 @@ def bench_fn(
        "x": x,
        "residual": residual,
        "quant_dtype": quant_dtype,
+        "group_size": group_size,
        "fn": fn,
    }
    return TBenchmark.Timer(
-        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+        stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)",
        globals=globals,
        label=label,
        sub_label=sub_label,
@@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.int8,
+            params.group_size,
            label,
            sub_label,
            unfused_int8_impl,
@@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.float8_e4m3fn,
+            params.group_size,
            label,
            sub_label,
            unfused_fp8_impl,
@@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.int8,
+            params.group_size,
            label,
            sub_label,
            fused_impl,
@@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
            x,
            residual,
            torch.float8_e4m3fn,
+            params.group_size,
            label,
            sub_label,
            fused_impl,
@@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu
        )
    )

+    # unfused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            unfused_groupwise_fp8_impl,
+            "unfused_groupwise_fp8_impl",
+        )
+    )
+
+    # fused groupwise fp8 impl.
+    timers.append(
+        bench_fn(
+            layer,
+            x,
+            residual,
+            torch.float8_e4m3fn,
+            params.group_size,
+            label,
+            sub_label,
+            fused_groupwise_impl,
+            "fused_groupwise_fp8_impl",
+        )
+    )
+
    print_timers(timers)

    return timers

--- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Any
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    _per_token_group_quant_fp8_colmajor,
+    silu_mul_per_token_group_quant_fp8_colmajor,
+)
+from vllm.triton_utils import triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+from .utils import ArgPool, Bench, CudaGraphBenchParams
+
+GROUP_SIZE = 128
+FLOAT8_T = torch.float8_e4m3fn
+
+
+def print_timers(timers: list[TMeasurement], cuda_graph_nops: int):
+    """Print a comparison table for `timers`, then a note on interpreting it.
+
+    Args:
+        timers: Measurements to tabulate with `TBenchmark.Compare`.
+        cuda_graph_nops: Number of consecutive invocations each reported
+            timing covers; quoted in the trailing note.
+    """
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+    # The note refers to the timings "reported above", so it has to be
+    # printed after the table rather than before it.
+    print(
+        f"Note : The timings reported above is for {cuda_graph_nops} "
+        "consecutive invocations of the benchmarking functions. "
+        f"Please divide by {cuda_graph_nops} for single invocation "
+        "timings."
+    )
+
+
+class ImplType(Enum):
+    """Selects which implementation a benchmark or correctness run exercises."""
+
+    SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1
+    REFERENCE = 2
+
+    def get_impl(self):
+        """Return the callable that corresponds to this member."""
+        if self is ImplType.REFERENCE:
+            return reference
+        if self is ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+            return silu_mul_per_token_group_quant_fp8_colmajor
+        raise ValueError(f"Unrecognized ImplType {self}")
+
+
+@dataclass
+class BenchmarkTensors:
+    """Input and pre-allocated output tensors for one (T, N) benchmark case."""
+
+    # (T, N) bfloat16 input shared by both implementations.
+    input: torch.Tensor
+    # (T, N // 2) FLOAT8_T buffer passed to the fused implementation as "output".
+    output: torch.Tensor
+
+    # Reference act output tensor
+    ref_act_out: torch.Tensor
+    # Reference quantized output tensor, (T, N // 2) in FLOAT8_T.
+    ref_quant_out: torch.Tensor
+
+    @staticmethod
+    def make(T: int, N: int) -> "BenchmarkTensors":
+        """Allocate all tensors on the CUDA device for a (T, N) input.
+
+        T must be a multiple of GROUP_SIZE and N a multiple of 2 * GROUP_SIZE.
+        """
+        assert T % GROUP_SIZE == 0
+        assert N % (GROUP_SIZE * 2) == 0
+
+        input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
+
+        # silu_mul_per_token_group_quant_fp8_colmajor output.
+        output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to(
+            FLOAT8_T
+        )
+
+        # reference output.
+        ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda")
+        ref_quant_out = torch.empty(
+            (T, N // 2), dtype=torch.bfloat16, device="cuda"
+        ).to(FLOAT8_T)
+
+        return BenchmarkTensors(
+            input=input,
+            output=output,
+            ref_act_out=ref_act_out,
+            ref_quant_out=ref_quant_out,
+        )
+
+    @property
+    def T(self):
+        """Size of the input's first dimension."""
+        return self.input.size(0)
+
+    @property
+    def N(self):
+        """Size of the input's second dimension."""
+        return self.input.size(1)
+
+    def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]:
+        """Return the keyword arguments to call `impl_type`'s implementation with.
+
+        Raises:
+            ValueError: If `impl_type` is not a recognized ImplType member.
+        """
+        if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR:
+            return {
+                "input": self.input,
+                "output": self.output,
+                "use_ue8m0": is_deep_gemm_e8m0_used(),
+            }
+        elif impl_type == ImplType.REFERENCE:
+            return {
+                "input": self.input,
+                "act_out": self.ref_act_out,
+                "quant_out": self.ref_quant_out,
+                "use_ue8m0": is_deep_gemm_e8m0_used(),
+            }
+        raise ValueError(f"Unrecognized impl_type {impl_type}")
+
+
+def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool):
+    """
+    Reference quantization path that launches the Triton kernel
+    `_per_token_group_quant_fp8_colmajor` from
+    vllm.model_executor.layers.quantization.utils.fp8_utils.
+
+    Writes the quantized values into `quant_out` (which must have the same
+    size as `x`) and returns `(quant_out, scales)`, where `scales` is a newly
+    allocated float32 tensor.
+    """
+    assert quant_out.size() == x.size()
+    # Allocate the scale tensor column-major format.
+    # The tensor is created with the group dimension first and then permuted,
+    # so the returned view has one scale per GROUP_SIZE elements of the last
+    # dimension of x.
+    shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1]
+    x_q = quant_out
+    x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2)
+
+    # One kernel program per quantization group.
+    M = x.numel() // GROUP_SIZE
+    N = GROUP_SIZE
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+
+    # Representable range of the FP8 dtype, passed to the kernel as bounds.
+    finfo = torch.finfo(FLOAT8_T)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    _per_token_group_quant_fp8_colmajor[(M,)](
+        x,
+        x_q,
+        x_s,
+        GROUP_SIZE,
+        x.shape[1],
+        x.stride(0),
+        x_s.stride(1),
+        eps=1e-10,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        use_ue8m0=use_ue8m0,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return x_q, x_s
+
+
+def reference(
+    input: torch.Tensor,
+    act_out: torch.Tensor,
+    quant_out: torch.Tensor,
+    use_ue8m0: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Unfused baseline: run silu_and_mul into `act_out`, then quantize it.
+
+    Returns the `(quantized, scales)` pair produced by `reference_quant`.
+    """
+    torch.ops._C.silu_and_mul(act_out, input)
+    return reference_quant(act_out, quant_out, use_ue8m0)
+
+
+def bench_impl(
+    bench_tensors: list[BenchmarkTensors], impl_type: ImplType
+) -> TMeasurement:
+    """Benchmark one implementation over a pool of pre-built tensor sets.
+
+    Args:
+        bench_tensors: Non-empty list of tensor sets; its length is used as
+            the argument-pool size handed to `CudaGraphBenchParams`.
+        impl_type: Which implementation to time.
+
+    Returns:
+        The measurement produced by `Bench.run()`.
+    """
+    T = bench_tensors[0].T
+    N = bench_tensors[0].N
+
+    arg_pool_size = len(bench_tensors)
+    kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors]
+    # Resolve the callable once; it is used for both warm-up and the timed run.
+    impl = impl_type.get_impl()
+
+    # warmup
+    for warmup_kwargs in kwargs_list:
+        impl(**warmup_kwargs)
+    torch.cuda.synchronize()
+
+    # Merge into a single kwargs and qualify arguments as ArgPool
+    pooled_kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+    for _kwargs in kwargs_list:
+        for k, v in _kwargs.items():
+            pooled_kwargs[k].values.append(v)
+
+    cuda_graph_params = CudaGraphBenchParams(arg_pool_size)
+    with Bench(
+        cuda_graph_params,
+        "silu-mul-quant",
+        f"num_tokens={T}, N={N}",
+        impl_type.name,
+        impl,
+        **pooled_kwargs,
+    ) as bench:
+        return bench.run()
+
+
+def test_correctness(T: int, N: int):
+    """Assert that the fused implementation matches the reference for (T, N).
+
+    Both the quantized values (compared as float32) and the scales must be
+    close according to `torch.testing.assert_close`.
+    """
+    print(f"Testing num_tokens={T}, N={N} ...")
+
+    bench_tensor = BenchmarkTensors.make(T, N)
+
+    def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]:
+        return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl))
+
+    # reference output
+    ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
+
+    # test output
+    out_q, out_s = output_from_impl(
+        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
+    )
+
+    torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32))
+    torch.testing.assert_close(ref_out_s, out_s)
+
+
+def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]:
+    """Check correctness and benchmark both implementations for every (N, T).
+
+    A comparison table is printed after each shape and once more for all
+    shapes at the end. Returns every collected measurement, with the fused
+    implementation's timer preceding the reference timer for each shape.
+    """
+    impl_order = (
+        ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR,
+        ImplType.REFERENCE,
+    )
+    all_timers: list[TMeasurement] = []
+    for N, T in product(Ns, Ts):
+        test_correctness(T, N)
+
+        tensor_pool: list[BenchmarkTensors] = []
+        for _ in range(arg_pool_size):
+            tensor_pool.append(BenchmarkTensors.make(T, N))
+
+        # Time the fused kernel first, then the reference, for this shape.
+        shape_timers = [bench_impl(tensor_pool, impl) for impl in impl_order]
+        all_timers.extend(shape_timers)
+
+        print_timers(shape_timers, cuda_graph_nops=arg_pool_size)
+
+    print_timers(all_timers, cuda_graph_nops=arg_pool_size)
+
+    return all_timers
+
+
+if __name__ == "__main__":
+    # Token counts: multiples of 128 up to 1920, then multiples of 2048 up to 131072.
+    T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)]
+    # Input widths; each must be a multiple of 2 * GROUP_SIZE.
+    N = [2048, 4096, 8192]
+
+    print(f"T = {T}, N = {N}")
+    run(T, N, arg_pool_size=8)