Commit fbeb8a6f authored by raojy

raw_vllm

parent 2ca8867f
# DeepSeek DeepGEMM Kernels Benchmark
This directory contains benchmarks comparing DeepSeek's DeepGEMM block FP8 GEMM kernels against vLLM's existing Triton- and CUTLASS-based kernels.
Currently it covers only dense GEMMs and runs only on Hopper GPUs.
## Setup
Install vLLM as you normally would, then install DeepGEMM from source in its own directory:
```bash
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
cd DeepGEMM
python setup.py install
uv pip install -e .
```
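To confirm the setup, you can check that both packages import cleanly (this assumes DeepGEMM installs the `deep_gemm` Python module, which vLLM's `vllm.utils.deep_gemm` wrapper looks for):
```bash
python -c "import deep_gemm, vllm; print(deep_gemm.__file__, vllm.__version__)"
```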
## Usage
```console
python benchmark_fp8_block_dense_gemm.py
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
===== STARTING FP8 GEMM BENCHMARK =====
PyTorch version: 2.5.1+cu124
CUDA version: 12.4
Triton version: 3.1.0
Using device: NVIDIA H100 80GB HBM3
WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
===== PERFORMANCE COMPARISON =====
DeepGEMM Implementation:
+------+-------+-------+-----------+--------+--------+
| m | n | k | Time (μs) | TFLOPS | GB/s |
+------+-------+-------+-----------+--------+--------+
| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 |
| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 |
| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 |
| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 |
| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 |
| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 |
| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 |
| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 |
| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 |
| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 |
| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 |
| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 |
| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 |
| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 |
| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 |
| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 |
+------+-------+-------+-----------+--------+--------+
vLLM Triton Implementation:
+------+-------+-------+-----------+--------+--------+--------------+
| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM |
+------+-------+-------+-----------+--------+--------+--------------+
| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster |
| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower |
| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower |
| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower |
| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower |
| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower |
| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster |
| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster |
| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower |
| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster |
| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower |
| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower |
| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower |
| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower |
| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower |
| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower |
+------+-------+-------+-----------+--------+--------+--------------+
vLLM CUTLASS Implementation:
+------+-------+-------+-----------+--------+--------+--------------+--------------+
| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster |
| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster |
| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster |
| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster |
| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster |
| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster |
| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster |
| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster |
| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster |
| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster |
| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster |
| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster |
| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 2.43x faster |
| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster |
| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster |
| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
===== AVERAGE PERFORMANCE =====
+----------------+------------+----------+---------------+
| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) |
+----------------+------------+----------+---------------+
| DeepGEMM | 310.98 | 1052.10 | 0.11 |
| vLLM Triton | 144.30 | 715.60 | 0.23 |
| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 |
+----------------+------------+----------+---------------+
===== AVERAGE SPEEDUPS =====
+-----------------------------+--------------+
| Comparison | Speedup |
+-----------------------------+--------------+
| DeepGEMM vs vLLM Triton | 1.71x faster |
| DeepGEMM vs vLLM CUTLASS | 0.94x slower |
| vLLM CUTLASS vs vLLM Triton | 1.84x faster |
+-----------------------------+--------------+
===== ACCURACY COMPARISON =====
+----------------+-----------------------+
| Implementation | Avg Diff vs Reference |
+----------------+-----------------------+
| DeepGEMM | 0.000684 |
| vLLM Triton | 0.000684 |
| vLLM CUTLASS | 0.000684 |
+----------------+-----------------------+
```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import time
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
calc_diff,
fp8_gemm_nt,
per_block_cast_to_fp8,
)
def benchmark_shape(
m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False,
) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
C_ref = A @ B.t()
# Pre-quantize B for all implementations
# (weights can be pre-quantized offline)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
# Block size configuration
block_size = [128, 128]
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True
)
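    # Note on activation-scale layouts (as set up above): DeepGEMM consumes
    # TMA-aligned, column-major scales; the CUTLASS path consumes column-major
    # scales; the Triton kernel uses the default row-major layout produced by
    # per_token_group_quant_fp8.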
# === DeepGEMM Implementation ===
def deepgemm_gemm():
fp8_gemm_nt(
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
)
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm(
A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16,
)
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
return ops.cutlass_scaled_mm(
A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16,
)
# Run correctness check first
if verbose:
print("Running correctness check...")
C_deepgemm = deepgemm_gemm()
C_vllm_triton = vllm_triton_gemm()
C_vllm_cutlass = vllm_cutlass_gemm()
deepgemm_diff = calc_diff(C_deepgemm, C_ref)
vllm_triton_diff = calc_diff(C_vllm_triton, C_ref)
vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref)
if verbose:
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print(
"vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
)
print(
"vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
)
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm,
}
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
for name, func in implementations.items():
# Warmup
for _ in range(warmup):
func()
torch.cuda.synchronize()
# Timing loop
torch.cuda.synchronize()
start = time.time()
for _ in range(repeat):
func()
torch.cuda.synchronize()
end = time.time()
# Calculate timing and TFLOPS
avg_time_ms = (end - start) / repeat * 1000
avg_time_us = avg_time_ms * 1000
tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12
gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3)
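        # FLOPs: 2*m*n*k for one GEMM. Bytes moved: the A (m*k) and B (k*n)
        # operands are FP8 (1 byte/element) and the C output (m*n) is BF16
        # (2 bytes/element); the block-scale tensors are ignored.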
benchmark_results["implementations"][name] = {
"time_ms": avg_time_ms,
"time_us": avg_time_us,
"tflops": tflops,
"gb_s": gb_s,
"diff": {
"DeepGEMM": 0.0
if name == "DeepGEMM"
else calc_diff(func(), C_deepgemm),
"Reference": deepgemm_diff
if name == "DeepGEMM"
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
},
}
if verbose:
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
if verbose:
print(
f"DeepGEMM is {1 / speedup:.2f}x "
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
)
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
cutlass_vs_triton
)
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton"
)
return benchmark_results
def format_table_row(values, widths):
"""Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None):
"""Print a table with headers and rows."""
if title:
print(f"\n{title}")
# Calculate column widths based on headers and data
widths = [
max(len(str(h)), max(len(str(row[i])) for row in rows))
for i, h in enumerate(headers)
]
# Create separator line
separator = "+-" + "-+-".join("-" * w for w in widths) + "-+"
# Print table
print(separator)
print(format_table_row(headers, widths))
print(separator)
for row in rows:
print(format_table_row(row, widths))
print(separator)
def format_speedup(value):
"""Format speedup value with indicator if it's faster or slower."""
return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}"
def run_benchmarks(verbose: bool = False):
"""Run benchmarks for a set of common shapes."""
print("===== STARTING FP8 GEMM BENCHMARK =====")
# Make sure we're using the GPU
if not torch.cuda.is_available():
print("CUDA not available! Tests require GPU.")
return
# Print system information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Triton version: {triton.__version__}")
print(f"Using device: {torch.cuda.get_device_name()}")
# Enable TF32 for better performance
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Set seeds for reproducibility
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Define benchmark shapes (m, n, k)
shapes = [
(8, 4096, 7168),
(8, 7168, 18432),
(8, 18432, 7168),
(64, 4096, 7168),
(64, 7168, 18432),
(64, 18432, 7168),
(64, 24576, 1536),
(64, 32768, 512),
(64, 7168, 16384),
(128, 4096, 7168),
(128, 7168, 18432),
(128, 18432, 7168),
(1024, 4096, 7168),
(1024, 18432, 7168),
(2048, 4096, 7168),
(4096, 4096, 7168),
]
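    # NOTE: the assignment below overrides the shape list above; edit or remove
    # it to benchmark the original set of shapes.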
shapes = [
# (64, 2112, 7168),
(64, 24576, 1536),
(64, 32768, 512),
(64, 7168, 16384),
(64, 4096, 7168),
(64, 7168, 2048),
# (128, 2112, 7168),
(128, 24576, 1536),
(128, 32768, 512),
(128, 7168, 16384),
(128, 4096, 7168),
(128, 7168, 2048),
# (4096, 2112, 7168),
(4096, 24576, 1536),
(4096, 32768, 512),
(4096, 7168, 16384),
(4096, 4096, 7168),
(4096, 7168, 2048),
]
all_results = []
for m, n, k in shapes:
result = benchmark_shape(m, n, k, verbose=verbose)
all_results.append(result)
# Print results in a nicely formatted table
print("\n===== PERFORMANCE COMPARISON =====")
# Print DeepGEMM table
deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"]
deepgemm_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
]
)
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
# Print vLLM Triton table
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(speedup),
]
)
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
"m",
"n",
"k",
"Time (μs)",
"TFLOPS",
"GB/s",
"vs DeepGEMM",
"vs Triton",
]
cutlass_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton),
]
)
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
}
for result in all_results:
for impl in implementations:
impl_data = result["implementations"][impl]
avg_metrics[impl]["tflops"] += impl_data["tflops"]
avg_metrics[impl]["gb_s"] += impl_data["gb_s"]
avg_metrics[impl]["time_ms"] += impl_data["time_ms"]
num_shapes = len(all_results)
avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"]
avg_rows = []
for impl in implementations:
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append(
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
)
print_table(avg_headers, avg_rows)
# Calculate average speedups
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0,
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
vllm_triton_time / vllm_cutlass_time
)
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
speedup_rows = []
for comparison, total in avg_speedups.items():
avg_speedup = total / num_shapes
status = "faster" if avg_speedup > 1 else "slower"
speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"])
print_table(speedup_headers, speedup_rows)
# Average accuracy comparison
print("\n===== ACCURACY COMPARISON =====")
avg_diff = {impl: 0 for impl in implementations}
for result in all_results:
for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []
for impl in implementations:
diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"])
print_table(diff_headers, diff_rows)
if __name__ == "__main__":
run_benchmarks(verbose=False)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import regex as re
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.utils.argparse_utils import FlexibleArgumentParser
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
)
parser.add_argument("filename", type=str)
args = parser.parse_args()
with open(args.filename, "rb") as f:
data = pickle.load(f)
raw_results: list[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
for v in raw_results:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
else:
raise Exception("MKN not found")
result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
if result is not None:
M = result.group(1)
else:
raise Exception("MKN not found")
kernel = v.task_spec.description
results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median})
rows = int(math.ceil(len(results) / 2))
fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
axs = axs.flatten()
for axs_idx, (shape, data) in enumerate(results.items()):
plt.sca(axs[axs_idx])
df = pd.DataFrame(data)
sns.lineplot(
data=df,
x="batch_size",
y="median",
hue="kernel",
style="kernel",
markers=True,
dashes=False,
palette="Dark2",
)
plt.title(f"Shape: {shape}")
plt.ylabel("time (median, s)")
plt.tight_layout()
plt.savefig("graph_machete_bench.pdf")
pandas
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Callable, Iterable
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
@dataclasses.dataclass
class CudaGraphBenchParams:
num_ops_in_cuda_graph: int
@dataclasses.dataclass
class ArgPool:
"""
    When some argument of the benchmarked function is annotated with this type,
    the benchmarking class (Bench) will collapse the argument to a single value
    picked from the given list of values during function invocation.
    For every invocation during a benchmarking run, it will choose a
    different value from the list.
    """
values: Iterable[Any]
def __getitem__(self, index):
return self.values[index]
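# Illustrative sketch (hypothetical shapes/labels) of how ArgPool is meant to
# be used: each invocation made by the benchmark picks the next tensor from
# the pool instead of reusing a single cached input.
#
#   a_pool = ArgPool([torch.randn(128, 128, device="cuda") for _ in range(8)])
#   b = torch.randn(128, 128, device="cuda")
#   with Bench(None, "mm", "128x128x128", "torch.mm", torch.mm, a_pool, b) as bench:
#       measurement = bench.run()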
class Bench:
class ArgsIterator:
def __init__(self, args_list, kwargs_list):
assert len(args_list) == len(kwargs_list)
self.args_list = args_list
self.kwargs_list = kwargs_list
self.n = len(self.args_list)
self.idx = 0
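        # Note: __next__ is written as a generator function; calling it returns
        # an infinite iterator that cycles through the (args, kwargs) pairs,
        # which is how the runners below consume the argument pool.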
def __next__(self):
while True:
yield (self.args_list[self.idx], self.kwargs_list[self.idx])
self.idx += 1
self.idx = self.idx % self.n
def reset(self):
self.idx = 0
@property
def n_args(self):
return self.n
def __init__(
self,
cuda_graph_params: CudaGraphBenchParams | None,
label: str,
sub_label: str,
description: str,
fn: Callable,
*args,
**kwargs,
):
self.cuda_graph_params = cuda_graph_params
self.use_cuda_graph = self.cuda_graph_params is not None
self.label = label
self.sub_label = sub_label
self.description = description
self.fn = fn
# Process args
self._args = args
self._kwargs = kwargs
self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs)
self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list)
# Cudagraph runner
self.g = None
if self.use_cuda_graph:
self.g = self.get_cuda_graph_runner()
# benchmark run params
self.min_run_time = 1
def collapse_argpool(self, *args, **kwargs):
argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
arg for arg in kwargs.values() if isinstance(arg, ArgPool)
]
if len(argpool_args) == 0:
return [args], [kwargs]
# Make sure all argpools are of the same size
argpool_size = len(argpool_args[0].values)
assert all([argpool_size == len(arg.values) for arg in argpool_args])
# create copies of the args
args_list = []
kwargs_list = []
for _ in range(argpool_size):
args_list.append(args)
kwargs_list.append(kwargs.copy())
for i in range(argpool_size):
# collapse args; Just pick the ith value
args_list[i] = tuple(
[arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]]
)
# collapse kwargs
kwargs_i = kwargs_list[i]
arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)]
for k in arg_pool_keys:
# again just pick the ith value
kwargs_i[k] = kwargs_i[k][i]
kwargs_list[i] = kwargs_i
return args_list, kwargs_list
def get_cuda_graph_runner(self):
assert self.use_cuda_graph
assert self.args_iterator is not None
num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
# warmup
args_it = self.args_iterator.__next__()
for _ in range(2):
args, kwargs = next(args_it)
self.fn(*args, **kwargs)
self.args_iterator.reset()
args_it = self.args_iterator.__next__()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
for _ in range(num_graph_ops):
args, kwargs = next(args_it)
self.fn(*args, **kwargs)
return g
    def run_cuda_graph(self) -> TMeasurement:
assert self.use_cuda_graph
globals = {"g": self.g}
return TBenchmark.Timer(
stmt="g.replay()",
globals=globals,
label=(
f"{self.label}"
f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
),
sub_label=self.sub_label,
description=self.description,
).blocked_autorange(min_run_time=self.min_run_time)
def run_eager(self) -> TMeasurement:
setup = None
stmt = None
globals = None
has_arg_pool = self.args_iterator.n_args > 1
if has_arg_pool:
setup = """
args_iterator.reset()
args_it = args_iterator.__next__()
"""
stmt = """
args, kwargs = next(args_it)
fn(*args, **kwargs)
"""
globals = {"fn": self.fn, "args_iterator": self.args_iterator}
else:
# no arg pool. Just use the args and kwargs directly
self.args_iterator.reset()
args_it = self.args_iterator.__next__()
args, kwargs = next(args_it)
setup = ""
stmt = """
fn(*args, **kwargs)
"""
globals = {"fn": self.fn, "args": args, "kwargs": kwargs}
return TBenchmark.Timer(
stmt=stmt,
setup=setup,
globals=globals,
label=self.label,
sub_label=self.sub_label,
description=self.description,
).blocked_autorange(min_run_time=self.min_run_time)
def run(self) -> TMeasurement:
timer = None
if self.use_cuda_graph: # noqa SIM108
            timer = self.run_cuda_graph()
else:
timer = self.run_eager()
if not timer.meets_confidence() or timer.has_warnings:
print("Doesn't meet confidence - re-running bench ...")
return self.run()
return timer
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
if exc_type:
print(f"exc type {exc_type}")
print(f"exc value {exc_value}")
print(f"exc traceback {traceback}")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
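# For example (illustrative helper, not part of this file's data), the per-GPU
# GEMM shape at a given TP size can be derived from an entry above as:
#   def shard(kn, tp_split_dim, tp_size):
#       kn = list(kn)
#       kn[tp_split_dim] //= tp_size
#       return kn
#   shard([14336, 4096], 0, 2)  # -> [7168, 4096], matching the TP2 example above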
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"meta-llama/Llama-3.1-405b-hf": [
([16384, 18432], 1),
([16384, 16384], 0),
([16384, 106496], 1),
([53248, 16384], 0),
],
"meta-llama/Llama-3.1-8B-Instruct": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-3.3-70B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"mistralai/Mistral-Large-Instruct-2407": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 57344], 1),
([28672, 12288], 0),
],
"Qwen/Qwen2.5-7B-Instruct": [
([3584, 4608], 1),
([3584, 3584], 0),
([3584, 37888], 1),
([18944, 3584], 0),
],
"Qwen/Qwen2.5-32B-Instruct": [
([5120, 7168], 1),
([5120, 5120], 0),
([5120, 55296], 1),
([27648, 5120], 0),
],
"Qwen/Qwen2.5-72B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 59136], 1),
([29568, 8192], 0),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
([2048, 3072], 1),
([2048, 4096], 1),
([2048, 2048], 0),
([2048, 576], 0),
([2048, 21888], 1),
([10944, 2048], 0),
([2048, 2816], 1),
([1408, 2048], 0),
],
"CohereLabs/c4ai-command-a-03-2025": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 73728], 1),
([36864, 12288], 0),
],
}
# Benchmark KV Cache Offloading with Multi-Turn Conversations
The pip requirements for `benchmark_serving_multi_turn.py` can be found in `requirements.txt`.
First, start serving your model:
```bash
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
vllm serve $MODEL_PATH --served-model-name Llama
```
The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from Hugging Face).
## Synthetic Multi-Turn Conversations
Download the following text file (used for generation of synthetic conversations)
```bash
wget https://www.gutenberg.org/ebooks/1184.txt.utf-8
mv 1184.txt.utf-8 pg1184.txt
```
The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`).
You may use other text files if you prefer; this specific file is not required.
Then run the benchmarking script:
```bash
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
```
You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
If successful, you will see output similar to the following
```bash
----------------------------------------------------------------------------------------------------
Statistics summary:
runtime_sec = 215.810
requests_per_sec = 0.769
----------------------------------------------------------------------------------------------------
count mean std min 25% 50% 75% 90% 99% max
ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54
tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05
latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94
input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00
input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00
output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00
output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00
----------------------------------------------------------------------------------------------------
```
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
benchmark-only runtime so the reported throughput stays comparable).
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool will generate synthetic multi-turn (question and answer) conversations.
The file `generate_multi_turn.json` is an example file.
The file must contain the sections `prompt_input` and `prompt_output`.
The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
The final value will always be rounded to an even number so each user turn has a reply.
* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
* `num_tokens` - Total token length of each **user** message (one turn).
The `prompt_output` section must contain `num_tokens`:
* `num_tokens` - Total token length of each **assistant** message (one turn).
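A minimal example combining these sections (the values and distribution choices below are illustrative; see `generate_multi_turn.json` for a complete file):
```json
{
    "filetype": "generate_conversations",
    "num_conversations": 24,
    "text_files": ["pg1184.txt"],
    "prompt_input": {
        "num_turns": {"distribution": "uniform", "min": 12, "max": 18},
        "prefix_num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160}
    },
    "prompt_output": {
        "num_tokens": {"distribution": "uniform", "min": 80, "max": 120}
    }
}
```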
### Random distributions for synthetic conversations generation
When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
The distribution determines how to randomly sample values for the field.
The available distributions are listed below.
**Note:** The optional `max` field (for lognormal, zipf, and poisson) caps sampled values at an upper bound.<br/>
This can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
#### constant
```json
{
"distribution": "constant",
"value": 500
}
```
* `value` - the fixed integer value (always returns the same number).
#### uniform
```json
{
"distribution": "uniform",
"min": 12,
"max": 18
}
```
* `min` - minimum value (inclusive).
* `max` - maximum value (inclusive); must be greater than or equal to `min`.
#### lognormal
```json
{
"distribution": "lognormal",
"average": 1000,
"max": 5000
}
```
You can parameterize the lognormal distribution in one of two ways:
Using the average and optional median ratio:
* `average` - target average value of the distribution.
* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
Using the parameters of the underlying normal distribution:
* `mean` - mean of the underlying normal distribution.
* `sigma` - standard deviation of the underlying normal distribution.
#### zipf
```json
{
"distribution": "zipf",
"alpha": 1.2,
"max": 100
}
```
* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
#### poisson
```json
{
"distribution": "poisson",
"alpha": 10,
"max": 50
}
```
* `alpha` - expected value (λ). Also the variance of the distribution.
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json`
Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py`
```bash
python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128
```
The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles.
The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed).
Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`.
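You can then run the benchmark against this file, reusing the serving setup from above:
```bash
python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
    --input-file sharegpt_conv_128.json --num-clients 2
```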
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple
import numpy as np # type: ignore
import pandas as pd # type: ignore
from bench_utils import (
TEXT_SEPARATOR,
Color,
logger,
)
from tqdm import tqdm
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
ConvId = str
# A list of dicts (dicts with keys "id" and "messages")
ShareGptConversations = list[dict[str, Any]]
# A list of dicts (dicts with keys "role" and "content")
MessagesList = list[dict[str, str]]
# Map conversation ID to conversation messages
ConversationsMap = dict[ConvId, MessagesList]
class Distribution(ABC):
@abstractmethod
def sample(self, size: int = 1) -> np.ndarray:
pass
class UniformDistribution(Distribution):
def __init__(
self,
min_val: int | float,
max_val: int | float,
is_integer: bool = True,
) -> None:
self.min_val = min_val
self.max_val = max_val
self.is_integer = is_integer
def sample(self, size: int = 1) -> np.ndarray:
if self.is_integer:
return np.random.randint(
int(self.min_val), int(self.max_val + 1), size=size
)
else:
return np.random.uniform(self.min_val, self.max_val, size=size)
def __repr__(self) -> str:
return f"UniformDistribution[{self.min_val}, {self.max_val}]"
class ConstantDistribution(Distribution):
def __init__(self, value: int | float) -> None:
self.value = value
self.max_val = value
def sample(self, size: int = 1) -> np.ndarray:
return np.full(shape=size, fill_value=self.value)
def __repr__(self) -> str:
return f"Constant[{self.value}]"
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.zipf(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"ZipfDistribution[{self.alpha}]"
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.poisson(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"PoissonDistribution[{self.alpha}]"
class LognormalDistribution(Distribution):
def __init__(
self,
mean: float | None = None,
sigma: float | None = None,
average: int | None = None,
median_ratio: float | None = None,
max_val: int | None = None,
) -> None:
self.average = average
self.median_ratio = median_ratio
self.max_val = max_val
if average is not None:
            if average < 1:
                raise ValueError("Lognormal average must be at least 1")
if mean or sigma:
raise ValueError(
"When using lognormal average, you can't provide mean/sigma"
)
if self.median_ratio is None:
# Default value that provides relatively wide range of values
self.median_ratio = 0.85
# Calculate mean/sigma of np.random.lognormal based on the average
mean, sigma = self._generate_lognormal_by_median(
target_average=self.average, median_ratio=self.median_ratio
)
else:
if mean is None or sigma is None:
raise ValueError(
"Must provide both mean and sigma if average is not used"
)
if mean <= 0 or sigma < 0:
raise ValueError(
"Lognormal mean must be positive and sigma must be non-negative"
)
# Mean and standard deviation of the underlying normal distribution
# Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
@staticmethod
def _generate_lognormal_by_median(
target_average: int, median_ratio: float
) -> tuple[float, float]:
"""
Compute (mu, sigma) for a lognormal distribution given:
- a target average (mean of the distribution)
- a ratio of median / mean (controls skewness), assume mean > median
Background:
If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
* mean(X) = exp(mu + sigma^2 / 2)
* median(X) = exp(mu)
So:
median / mean = exp(mu) / exp(mu + sigma^2 / 2)
= exp(-sigma^2 / 2)
Rearranging:
sigma^2 = 2 * ln(mean / median)
mu = ln(median)
This gives a unique (mu, sigma) for any valid mean and median.
"""
# Check input validity: median must be smaller than mean
if median_ratio <= 0 or median_ratio >= 1:
raise ValueError("median_ratio must be in range (0, 1)")
target_median = target_average * median_ratio
# Solve sigma^2 = 2 * ln(mean / median)
sigma = np.sqrt(2 * np.log(target_average / target_median))
mu = np.log(target_median)
return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
if self.average is not None:
# Scale to average
samples *= self.average / samples.mean()
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
if self.average:
return (
f"LognormalDistribution[{self.average}, "
f"{self.median_ratio}, {self.max_val}]"
)
return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
num_conversations: int
text_files: list[str]
input_num_turns: Distribution
input_common_prefix_num_tokens: Distribution
input_prefix_num_tokens: Distribution
input_num_tokens: Distribution
output_num_tokens: Distribution
print_stats: bool
def verify_field_exists(
conf: dict, field_name: str, section: str, subsection: str
) -> None:
if field_name not in conf:
raise ValueError(
f"Missing field '{field_name}' in {section=} and {subsection=}"
)
def get_random_distribution(
conf: dict, section: str, subsection: str, optional: bool = False
) -> Distribution:
# section can be "prompt_input" or "prompt_output" (both required)
conf = conf[section]
if optional and subsection not in conf:
# Optional subsection, if not found assume the value is always 0
return ConstantDistribution(0)
# subsection can be "num_turns", "num_tokens" or "prefix_num_tokens"
if subsection not in conf:
raise ValueError(f"Missing subsection {subsection} in section {section}")
conf = conf[subsection]
distribution = conf.get("distribution")
if distribution is None:
raise ValueError(
f"Missing field 'distribution' in {section=} and {subsection=}"
)
if distribution == "constant":
verify_field_exists(conf, "value", section, subsection)
return ConstantDistribution(conf["value"])
elif distribution == "zipf":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return ZipfDistribution(conf["alpha"], max_val=max_val)
elif distribution == "poisson":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
max_val = conf.get("max", None)
if "average" in conf:
# Infer lognormal mean/sigma (numpy) from input average
median_ratio = conf.get("median_ratio", None)
return LognormalDistribution(
average=conf["average"], median_ratio=median_ratio, max_val=max_val
)
# Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
return LognormalDistribution(
mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
)
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
verify_field_exists(conf, "max", section, subsection)
min_value = conf["min"]
max_value = conf["max"]
assert min_value > 0
assert min_value <= max_value
is_integer = isinstance(min_value, int) and isinstance(max_value, int)
return UniformDistribution(min_value, max_value, is_integer)
else:
raise ValueError(f"Unknown distribution: {distribution}")
def parse_input_json_file(conf: dict) -> GenConvArgs:
# Validate the input file
assert isinstance(conf, dict)
required_fields = [
"filetype",
"num_conversations",
"text_files",
"prompt_input",
"prompt_output",
]
for field in required_fields:
assert field in conf, f"Missing field {field} in input {conf}"
assert conf["filetype"] == "generate_conversations"
assert conf["num_conversations"] > 0, "num_conversations should be larger than zero"
text_files = conf["text_files"]
assert isinstance(text_files, list), "Field 'text_files' should be a list"
assert len(text_files) > 0, (
"Field 'text_files' should be a list with at least one file"
)
# Parse the parameters for the prompt input/output workload
input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns")
input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens")
input_common_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "common_prefix_num_tokens", optional=True
)
input_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "prefix_num_tokens"
)
output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens")
print_stats: bool = conf.get("print_stats", False)
assert isinstance(print_stats, bool), (
"Field 'print_stats' should be either 'true' or 'false'"
)
args = GenConvArgs(
num_conversations=conf["num_conversations"],
text_files=text_files,
input_num_turns=input_num_turns,
input_common_prefix_num_tokens=input_common_prefix_num_tokens,
input_prefix_num_tokens=input_prefix_num_tokens,
input_num_tokens=input_num_tokens,
output_num_tokens=output_num_tokens,
print_stats=print_stats,
)
return args
def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None:
# Collect statistics
conv_stats: list[dict[Any, Any]] = []
req_stats: list[int] = []
print("\nCollecting statistics...")
for messages in conversations.values():
# messages is a list of dicts
user_tokens: list[int] = []
assistant_tokens: list[int] = []
request_tokens: list[int] = []
req_tokens = 0
for m in messages:
content = m["content"]
num_tokens = len(tokenizer(content).input_ids)
if m["role"] == "user":
user_tokens.append(num_tokens)
# New user prompt including all chat history
req_tokens += num_tokens
request_tokens.append(req_tokens)
elif m["role"] == "assistant":
assistant_tokens.append(num_tokens)
# Update assistant answer
# (will be part of chat history for the next user prompt)
req_tokens += num_tokens
item_stats = {
"conversation_turns": len(messages),
"user_tokens": mean(user_tokens),
"assistant_tokens": mean(assistant_tokens),
}
conv_stats.append(item_stats)
req_stats.extend(request_tokens)
# Print statistics
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(conv_stats)
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Request statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(req_stats, columns=["request_tokens"])
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
def generate_conversations(
args: GenConvArgs, tokenizer: AutoTokenizer
) -> ConversationsMap:
# Text for all user prompts
# (text from the input text files will be appended to this line)
base_prompt_text = "Please rewrite the following text and add more content: "
base_prompt_token_count = len(
tokenizer.encode(base_prompt_text, add_special_tokens=False)
)
logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}")
logger.info(args)
list_of_tokens = []
for filename in args.text_files:
# Load text file that will be used to generate prompts
with open(filename) as file:
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
logger.info(
f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
f"total tokens so far: {len(list_of_tokens)}"
)
conversations: ConversationsMap = {}
conv_id = 0
# Generate number of turns for every conversation
turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations)
# Turn count should be at least 2 (one user prompt and one assistant answer)
turn_count = np.maximum(turn_count, 2)
# Round up to an even number (every user prompt should have an answer)
turn_count = turn_count + (turn_count % 2)
# Generate number of prefix tokens for every conversation
conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample(
args.num_conversations
)
# Used to reduce shared text between conversations
# (jump/skip over text sections between conversations)
base_offset = 0
# Common prefix size for all conversations (only 1 sample required)
common_prefix_text = ""
common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0]
if common_prefix_tokens > 0:
# Using "." at the end to separate sentences
common_prefix_text = (
tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "."
)
base_offset += common_prefix_tokens
for conv_id in tqdm(
range(args.num_conversations),
total=args.num_conversations,
desc="Generating conversations",
unit="conv",
):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
int
)
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
for turn_id in range(nturns):
if user_turn:
role = "user"
num_tokens = input_token_count[turn_id]
# Generate the user prompt,
# use a unique prefix (the conv_id) for each conversation
# (to avoid shared prefix between conversations)
content = f"{conv_id} is a nice number... "
if len(common_prefix_text) > 0 and turn_id == 0:
content = common_prefix_text + content
# Update the number of tokens left for the content
num_tokens -= len(tokenizer.encode(content, add_special_tokens=False))
if turn_id == 0:
prefix_num_tokens = conv_prefix_tokens[conv_id]
if prefix_num_tokens > 0:
# Add prefix text (context) to the first turn
start_offset = base_offset
end_offset = start_offset + prefix_num_tokens
assert len(list_of_tokens) > end_offset, (
"Not enough input text to generate "
f"{prefix_num_tokens} tokens for the "
f"prefix text ({start_offset=}, {end_offset=})"
)
content += f"{conv_id}, " + tokenizer.decode(
list_of_tokens[start_offset:end_offset]
)
base_offset += prefix_num_tokens
# Add the actual user prompt/question after the prefix text
content += base_prompt_text
num_tokens -= base_prompt_token_count
if num_tokens > 0:
# Add text from the input file (to reach the desired token count)
start_offset = base_offset + turn_id * input_token_count.max()
end_offset = start_offset + num_tokens
assert len(list_of_tokens) > end_offset, (
f"Not enough input text to generate {num_tokens} tokens "
f"for the prompt ({start_offset=}, {end_offset=})"
)
# Convert tokens back to text
content += tokenizer.decode(list_of_tokens[start_offset:end_offset])
else:
role = "assistant"
# This content will not be used as input to the LLM server
# (actual answers will be used instead).
# Content is only required to determine the min_tokens/max_tokens
# (inputs to the LLM server).
num_tokens = output_token_count[turn_id]
assert len(list_of_tokens) > num_tokens, (
f"Not enough input text to generate {num_tokens} "
"tokens for assistant content"
)
content = tokenizer.decode(list_of_tokens[:num_tokens])
# Append the user/assistant message to the list of messages
messages.append({"role": role, "content": content})
user_turn = not user_turn
# Add the new conversation
conversations[f"CONV_ID_{conv_id}"] = messages
# Increase base offset for the next conversation
base_offset += nturns
if args.print_stats:
print_conv_stats(conversations, tokenizer)
return conversations
def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap:
conversations: ConversationsMap = {}
for item in input_list:
conv_id: str = item["id"]
assert isinstance(conv_id, str)
assert conv_id not in conversations, (
f"Conversation ID {conv_id} found more than once in the input"
)
messages: MessagesList = item["messages"]
assert isinstance(messages, list), (
f"Conversation messages should be a list (ID: {conv_id})"
)
assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})"
conversations[conv_id] = messages
logger.info(f"Using {len(conversations)} unique conversations (IDs)")
assert len(conversations) == len(input_list)
# Print statistics about the selected conversations
stats: list[dict[str, Any]] = []
for conv_data in conversations.values():
stats.append({"num_turns": len(conv_data)})
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles)
print(conv_stats.transpose())
print(TEXT_SEPARATOR)
return conversations
def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations:
output: ShareGptConversations = []
for conv_id, conv_data in input_dict.items():
new_item = {"id": conv_id, "messages": conv_data}
output.append(new_item)
return output
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
from enum import Enum
class Color(Enum):
RED = "\033[91m"
GREEN = "\033[92m"
BLUE = "\033[94m"
PURPLE = "\033[95m"
CYAN = "\033[96m"
YELLOW = "\033[93m"
RESET = "\033[0m"
def __str__(self):
return self.value
TEXT_SEPARATOR = "-" * 100
# Configure the logger
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] - %(message)s",
datefmt="%d-%m-%Y %H:%M:%S",
)
logger = logging.getLogger(__name__)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import json
import logging
import multiprocessing as mp
import os
import random
import time
from collections import Counter, deque
from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
from typing import NamedTuple
import aiohttp # type: ignore
import numpy as np # type: ignore
import pandas as pd # type: ignore
from bench_dataset import (
ConversationsMap,
ConvId,
GenConvArgs,
MessagesList,
ShareGptConversations,
conversations_dict_to_list,
conversations_list_to_dict,
generate_conversations,
parse_input_json_file,
)
from bench_utils import TEXT_SEPARATOR, Color, logger
from transformers import AutoTokenizer # type: ignore
NUM_TOKENS_FROM_DATASET = 0
TERM_SIGNAL = None
class ConversationSampling(str, Enum):
ROUND_ROBIN = "round_robin"
RANDOM = "random"
def __str__(self):
return self.value
class ClientArgs(NamedTuple):
seed: int
max_num_requests: int | None
skip_first_turn: bool
max_turns: int | None
max_active_conversations: int
verbose: bool
print_content: bool
verify_output: bool
conversation_sampling: ConversationSampling
request_rate: float
max_retries: int
class RequestArgs(NamedTuple):
chat_url: str
model: str
stream: bool
limit_min_tokens: int # Use negative value for no limit
limit_max_tokens: int # Use negative value for no limit
timeout_sec: int
class BenchmarkArgs(NamedTuple):
url: str
num_clients: int
early_stop: bool
class ServerResponse(NamedTuple):
valid: bool
ttft_ms: float # time to first chunk
tpot_ms: float # time per output chunk (one or more tokens)
latency_ms: float
start_time_ms: float
first_chunk: str # first chunk of the content
content: str # includes the first_chunk
num_chunks: int
def __str__(self) -> str:
return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501
class RequestStats(NamedTuple):
ttft_ms: float
tpot_ms: float
latency_ms: float
start_time_ms: float
input_num_turns: int
input_num_tokens: int
output_num_tokens: int
output_num_chunks: int
output_num_first_chunk_tokens: int
approx_cached_percent: float
conversation_id: str
client_id: int
def __str__(self) -> str:
return (
f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501
f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501
f"approx_cached_percent {self.approx_cached_percent:.2f}%"
)
class MetricStats:
def __init__(self) -> None:
self.min: float | None = None
self.max: float | None = None
self.avg: float | None = None
self.sum = 0.0
self.count = 0
def update(self, value: float) -> None:
if self.min is None:
self.min = value
else:
self.min = min(self.min, value)
if self.max is None:
self.max = value
else:
self.max = max(self.max, value)
self.sum += value
self.count += 1
self.avg = self.sum / self.count
def __repr__(self) -> str:
if self.count == 0:
return "no data"
return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}"
class MovingAverage:
def __init__(self, window_size: int) -> None:
self.window_size = window_size
self.window = np.zeros(window_size)
self.index = 0
self.sum = 0.0
self.count = 0
self.avg: float | None = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
# Filling up the window
self.sum += new_value
self.window[self.count] = new_value
self.count += 1
else:
# Window is full, start replacing old values
old_value = self.window[self.index]
self.sum = self.sum - old_value + new_value
self.window[self.index] = new_value
self.index = (self.index + 1) % self.window_size
self.avg = self.sum / self.count
def __repr__(self) -> str:
if self.count == 0:
return "no data"
return f"avg: {self.avg:>10.3f} ({self.count} samples)"
class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger
self.metrics: dict[str, MovingAverage | MetricStats] = {
"moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(),
"tpot_ms": MetricStats(),
"latency_ms": MetricStats(),
"input_num_turns": MetricStats(),
"input_num_tokens": MetricStats(),
"output_num_tokens": MetricStats(),
}
def update(self, data: RequestStats) -> None:
self.metrics["ttft_ms"].update(data.ttft_ms)
self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms)
self.metrics["tpot_ms"].update(data.tpot_ms)
self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms)
self.metrics["latency_ms"].update(data.latency_ms)
self.metrics["input_num_turns"].update(data.input_num_turns)
self.metrics["input_num_tokens"].update(data.input_num_tokens)
self.metrics["output_num_tokens"].update(data.output_num_tokens)
def print(self) -> None:
self.logger.info("-" * 50)
for k, v in self.metrics.items():
kv_info = f"[{k:25}] {v}"
self.logger.info(kv_info)
self.logger.info("-" * 50)
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
def nanosec_to_sec(value: float) -> float:
return value / 1000000000.0
async def send_request(
session: aiohttp.ClientSession,
messages: list[dict[str, str]],
chat_url: str,
model: str,
stream: bool = True,
min_tokens: int | None = None,
max_tokens: int | None = None,
timeout_sec: int = 120,
) -> ServerResponse:
payload = {
"model": model,
"messages": messages,
"seed": 0,
"temperature": 0.0,
}
if stream:
payload["stream"] = True
payload["stream_options"] = {"include_usage": False}
if min_tokens is not None:
payload["min_tokens"] = min_tokens
if max_tokens is not None:
payload["max_tokens"] = max_tokens
headers = {"Content-Type": "application/json"}
# Calculate the timeout for the request
if max_tokens is not None:
# Assume TPOT of 200ms and use max_tokens to determine timeout
token_based_timeout = int(max_tokens * 0.2)
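# For example, max_tokens=2048 would give int(2048 * 0.2) = 409 seconds,
# which replaces the base timeout only if it is larger.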
if token_based_timeout > timeout_sec:
timeout_sec = token_based_timeout
logger.info(
"Using timeout of %ds based on max_tokens %d",
timeout_sec,
max_tokens,
)
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
ttft: float | None = None
chunk_delay: list[int] = []
latency: float | None = None
first_chunk = ""
generated_text = ""
start_time: int = time.perf_counter_ns()
most_recent_timestamp: int = start_time
async with session.post(
url=chat_url, json=payload, headers=headers, timeout=timeout
) as response:
http_status = HTTPStatus(response.status)
if http_status == HTTPStatus.OK:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
elif stream is False:
data = json.loads(chunk)
message = data["choices"][0]["message"]
assert message["role"] == "assistant"
generated_text += message["content"]
else:
timestamp: int = time.perf_counter_ns()
data = json.loads(chunk)
# Delta is the new content/text/data
delta = data["choices"][0]["delta"]
if delta.get("content", None):
if ttft is None:
# First token
first_token_time = time.perf_counter_ns()
ttft = first_token_time - start_time
first_chunk = delta["content"]
else:
# Decoding phase
chunk_delay.append(timestamp - most_recent_timestamp)
generated_text += delta["content"]
most_recent_timestamp = timestamp
else:
valid_response = False
content = await response.text()
logger.warning(
f"{Color.YELLOW}Received HTTP status {http_status.value} "
f"({http_status.phrase}): {content}{Color.RESET}"
)
if latency is None:
latency = -1.0
if valid_response:
# Streaming is disabled, latency was not set
latency = time.perf_counter_ns() - start_time
if ttft is None:
# The response was a single chunk
ttft = latency
# Each chunk may include more than one token
tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0
num_chunks: int = len(chunk_delay)
sr = ServerResponse(
valid=valid_response,
ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0,
tpot_ms=nanosec_to_millisec(tpot),
latency_ms=nanosec_to_millisec(latency),
start_time_ms=nanosec_to_millisec(start_time),
first_chunk=first_chunk,
content=generated_text,
num_chunks=num_chunks,
)
return sr
def get_short_string(input: str) -> str:
n = 20
if len(input) < 400:
return input
return f"{input[:n]}...{input[-n:]}"
def get_token_count(tokenizer: AutoTokenizer, text: str) -> int:
return len(tokenizer(text, add_special_tokens=False).input_ids)
def get_messages_token_count(
tokenizer: AutoTokenizer, messages: list[dict[str, str]]
) -> int:
token_count = 0
for m in messages:
token_count += get_token_count(tokenizer, m["content"])
return token_count
async def send_turn(
session: aiohttp.ClientSession,
client_id: int,
conv_id: str,
conversation_messages: MessagesList,
messages_to_use: int,
tokenizer: AutoTokenizer,
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
) -> RequestStats | None:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
messages = conversation_messages[:messages_to_use]
# Index of the next message (the role should be "user")
index = messages_to_use - 1
# Verify that the message has only two keys, "role" and "content"
assert len(messages[index].keys()) == 2
assert "role" in messages[index] and "content" in messages[index]
assert messages[index]["role"] == "user", (
f"Failed on conversation ID {conv_id}, message role should be user"
)
if verbose:
print(
f"{Color.CYAN}Messages (conversation ID {conv_id},"
f" {len(messages)} turns):{Color.RESET}",
messages,
)
# None means that there is no upper/lower limit for the output token count
min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens
max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens
if len(conversation_messages) > messages_to_use:
# The conversation contains an assistant answer for the next user prompt
if (
min_tokens == NUM_TOKENS_FROM_DATASET
or max_tokens == NUM_TOKENS_FROM_DATASET
):
# Compute number of tokens in the answer (from the input conversation)
assistant_answer = conversation_messages[messages_to_use]
answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"])
assert assistant_answer["role"] == "assistant"
if min_tokens == NUM_TOKENS_FROM_DATASET:
min_tokens = max(1, answer_num_tokens)
if max_tokens == NUM_TOKENS_FROM_DATASET:
max_tokens = max(1, answer_num_tokens)
# Send the current conversation to LLM and get a response
response: ServerResponse = await send_request(
session,
messages,
req_args.chat_url,
req_args.model,
req_args.stream,
min_tokens,
max_tokens,
req_args.timeout_sec,
)
if response.valid is False:
# Request failed
return None
# Compute number of tokens in input / output
input_num_tokens = get_messages_token_count(tokenizer, messages)
# Num tokens in the user's last question
question_num_tokens = get_token_count(tokenizer, messages[index]["content"])
# Num tokens in the history/context of the question
assert input_num_tokens >= question_num_tokens
history_num_tokens = input_num_tokens - question_num_tokens
# Num tokens in the LLM's answer (first chunk and full answer)
first_chunk_tokens = get_token_count(tokenizer, response.first_chunk)
output_content = response.content
output_num_tokens = get_token_count(tokenizer, output_content)
# Prefix caching approximated cached percent
approx_cached_percent = (
100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0
)
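# Illustrative example: 900 history tokens out of 1000 input tokens means the
# conversation history (~90% of the prompt) is assumed to hit the prefix cache.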
# Compute the correct TTFT and TPOT (based on tokens and not chunks).
# Required because multiple output tokens may be bundled in a single chunk.
if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens:
# More than one token and more than one chunk in the output
decode_ms = response.latency_ms - response.ttft_ms
decode_num_tokens = output_num_tokens - first_chunk_tokens
tpot_ms = decode_ms / decode_num_tokens
else:
# In this case: output_num_tokens == first_chunk_tokens
# Output was a single chunk (output_num_tokens > 1)
# or even a single token (output_num_tokens == 1)
tpot_ms = 0.0
if first_chunk_tokens > 1:
# First chunk had multiple tokens, adjust TTFT for a single token
delta_ms = (first_chunk_tokens - 1) * tpot_ms
ttft_ms = max(0.1, response.ttft_ms - delta_ms)
else:
# First chunk had only one token
ttft_ms = response.ttft_ms
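# Worked example (illustrative): latency_ms=500, ttft_ms=120, 41 output tokens
# with 5 of them in the first chunk -> tpot_ms = (500 - 120) / 36 ≈ 10.56 and
# ttft_ms is reduced by (5 - 1) * 10.56 ≈ 42.2, giving ≈ 77.8.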
rs = RequestStats(
ttft_ms=ttft_ms,
tpot_ms=tpot_ms,
latency_ms=response.latency_ms,
start_time_ms=response.start_time_ms,
input_num_turns=len(messages),
input_num_tokens=input_num_tokens,
output_num_tokens=output_num_tokens,
output_num_chunks=response.num_chunks,
output_num_first_chunk_tokens=first_chunk_tokens,
approx_cached_percent=approx_cached_percent,
conversation_id=conv_id,
client_id=client_id,
)
if verbose:
print(
f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}",
output_content,
)
print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}")
print("-" * 70)
# Save the LLM's answer (will be used as part of the context for the next user turn)
answer_index = messages_to_use
if len(conversation_messages) > answer_index:
assert conversation_messages[answer_index]["role"] == "assistant", (
f"Failed on conversation ID {conv_id}, message role should be assistant"
)
orig_content = conversation_messages[answer_index]["content"]
if verify_output:
# Compare the new answer to the answer from the input file
debug_info = (
f"LLM/dataset answers do not match ({conv_id}):"
f"\n'{get_short_string(output_content)}' (len: {len(output_content)}),"
f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})"
)
if orig_content != output_content:
raise ValueError(debug_info)
# Update the answer
conversation_messages[answer_index]["content"] = output_content
else:
# A user prompt that has no answer, add the answer as a new message
new_answer = {"role": "assistant", "content": output_content}
conversation_messages.append(new_answer)
return rs
async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
# Generate a random time interval from the Poisson distribution
assert request_rate > 0
interval = np.random.exponential(1.0 / request_rate)
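# e.g. request_rate=2.0 draws sleep intervals with a mean of 0.5 seconds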
if verbose:
logger.info(f"Sleeping for {interval:.3f} seconds...")
await asyncio.sleep(interval)
async def exponential_backoff_sleep(
attempt_cnt: int,
base_rate: float = 1.0,
backoff_factor: float = 2.0,
jitter_fraction: float = 0.10,
verbose: bool = False,
) -> None:
# Sleep with exponential backoff and jitter after a failed request.
backoff_delay = base_rate * (backoff_factor**attempt_cnt)
jittered_delay = backoff_delay * (
1 + np.random.uniform(-jitter_fraction, jitter_fraction)
)
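# e.g. with the defaults (base_rate=1.0, backoff_factor=2.0), attempts 0, 1, 2
# back off for roughly 1s, 2s and 4s, each with up to +/-10% jitter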
if verbose:
logger.info(f"Backoff for {jittered_delay:.3f} seconds...")
await asyncio.sleep(jittered_delay)
async def client_main(
args: ClientArgs,
req_args: RequestArgs,
client_id: int,
tokenizer: AutoTokenizer,
stop_event: mp.Event, # type: ignore
task_queue: mp.Queue,
result_queue: mp.Queue,
conv_queue: mp.Queue,
) -> None:
logger.info(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
# Set unique seed per client (each client runs in its own process)
# Add 1 to ensure no client uses the same seed as the main process
client_seed = args.seed + client_id + 1
random.seed(client_seed)
np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
conv_id_queue: deque = deque(maxlen=args.max_active_conversations)
# Keep track of how many messages have been used for each conversation
turns_count: Counter = Counter()
num_successes = 0
num_failures = 0
# Track the timestamp (time.perf_counter())
# of the last turn per conversation (only for debug)
time_of_last_turn: dict[ConvId, float] = {}
# Flag that indicates that there are no new tasks (conversations) for the client
task_queue_empty = False
async with aiohttp.ClientSession() as session:
# Print progress
while task_queue_empty is False:
result = None
if (
args.max_num_requests
and num_successes + num_failures == args.max_num_requests
):
logger.info(
f"{Color.YELLOW}Client {client_id} reached "
f"request limit{Color.RESET}"
)
break
if stop_event.is_set(): # type: ignore
logger.info(
f"{Color.YELLOW}Client {client_id} received "
f"a termination signal{Color.RESET}"
)
break
while (
len(active_convs) < args.max_active_conversations
and task_queue_empty is False
):
# Get a new conversation from the task queue
conv_id, messages = task_queue.get()
if conv_id is TERM_SIGNAL:
task_queue_empty = True
break
if args.skip_first_turn:
# Skip the first turn (both user and assistant),
# relevant if warmup was enabled.
# Default turns_count[conv_id] will be zero if conv_id
# was never inserted/updated in turns_count.
turns_count[conv_id] += 2
if turns_count[conv_id] < len(messages):
# Add new conversation
active_convs[conv_id] = messages
conv_id_queue.append(conv_id)
if args.verbose:
logger.info(
f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501
)
elif args.verbose:
# No more messages (conversation finished during the warmup)
logger.info(
f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501
)
if len(active_convs) == 0 or task_queue_empty:
logger.info(
f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}"
)
break
# Pick an active conversation for the next request
if args.conversation_sampling == ConversationSampling.ROUND_ROBIN:
conv_id = conv_id_queue.pop()
else:
# ConversationSampling.RANDOM
active_ids = list(active_convs.keys())
conv_id = random.choice(active_ids)
messages = active_convs[conv_id]
assert isinstance(messages, list) and len(messages) > 0
# Update the number of messages to use
turns_count[conv_id] += 1
current_turn = turns_count[conv_id]
assert current_turn < len(messages), (
f"Turn number {current_turn} is invalid for conversation ID {conv_id}"
f" that has only {len(messages)} messages"
)
if args.verbose:
curr_time_sec: float = time.perf_counter()
time_since_last_turn: str | float = "N/A"
if conv_id in time_of_last_turn:
time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3
)
logger.info(
f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501
)
time_of_last_turn[conv_id] = curr_time_sec
success = False
for attempt_cnt in range(args.max_retries + 1):
try:
exception = False
result = await send_turn(
session,
client_id,
conv_id,
messages,
current_turn,
tokenizer,
req_args,
args.print_content,
args.verify_output,
)
if result is not None:
result_queue.put(result)
success = True
break
else:
logger.warning(
f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
)
except asyncio.exceptions.TimeoutError:
exception = True
logger.error(
"%sClient %d - Timeout during conversation ID %s (turn: %d). "
"Base timeout is %ss (set with --request-timeout-sec), but the "
"effective timeout may be longer based on max_tokens. If this "
"is unexpected, consider increasing the timeout or checking "
"model performance.%s",
Color.RED,
client_id,
conv_id,
current_turn,
req_args.timeout_sec,
Color.RESET,
)
except Exception:
exception = True
logger.exception(
f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
)
# Sleep before retry if not last attempt
if not success and attempt_cnt < args.max_retries:
await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)
if not success:
num_failures += 1
# Remove the conversation (should not be used again)
active_convs.pop(conv_id)
if exception:
break # Exit gracefully instead of raising an error
else:
num_successes += 1
# Update the turns counter to include the LLM response
# The LLM response will be used as context for the next user turn
turns_count[conv_id] += 1
max_turns = len(messages)
if args.max_turns is not None:
# Limit the number of turns in the conversation
max_turns = min(args.max_turns, max_turns)
if turns_count[conv_id] >= max_turns:
# Conversation has no more turns (no longer active)
# save the updated conversation (with the LLM server's answer)
conv_queue.put((conv_id, active_convs.pop(conv_id)))
if args.verbose:
logger.info(
f"{Color.GREEN}Client {client_id} finished "
f"conversation ID {conv_id}{Color.RESET}"
)
else:
# Conversation is not finished, insert it at the back of the queue
conv_id_queue.appendleft(conv_id)
# Sleep between requests (if lambda is positive)
if args.request_rate > 0:
await poisson_sleep(args.request_rate, args.verbose)
# Send indication that the client is done
conv_queue.put((TERM_SIGNAL, TERM_SIGNAL))
logger.info(
f"{Color.CYAN}Client {client_id} is done "
f"({num_successes=}, {num_failures=}){Color.RESET}"
)
def worker_function(
client_id: int,
tokenizer: AutoTokenizer,
client_args: ClientArgs,
req_args: RequestArgs,
stop_event: mp.Event, # type: ignore
task_queue: mp.Queue,
result_queue: mp.Queue,
conv_queue: mp.Queue,
) -> None:
asyncio.run(
client_main(
client_args,
req_args,
client_id,
tokenizer,
stop_event,
task_queue,
result_queue,
conv_queue,
)
)
def get_client_config(
args: argparse.Namespace, input_conv: ConversationsMap
) -> tuple[ClientArgs, RequestArgs]:
if args.num_clients < 1:
raise ValueError("Number of clients must be a positive number")
if len(input_conv) < args.num_clients:
raise ValueError(
"Number of conversations must be equal or larger than the number of clients"
)
max_req_per_client: int | None = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
if req_per_client < 1:
raise ValueError("Number of requests should be at least one per client")
max_req_per_client = req_per_client
max_active_conversations = args.max_active_conversations
if max_active_conversations is None:
# Each client will have only one active conversation at a time
max_active_conversations = args.num_clients
if max_active_conversations > len(input_conv):
raise ValueError(
f"Max active conversations {max_active_conversations} "
"must be equal or less than the total number of conversations"
)
# Max number of active conversations per client
max_active_conv_per_client = max_active_conversations // args.num_clients
if max_active_conv_per_client < 1:
raise ValueError(
f"Max active conversations {max_active_conversations} "
"must be equal or greater than the number of clients"
)
# Skip the first user turn (as part of the warmup)
skip_first_turn = args.warmup_step
# Common arguments for all clients
client_args = ClientArgs(
seed=args.seed,
max_num_requests=max_req_per_client,
skip_first_turn=skip_first_turn,
max_turns=args.max_turns,
max_active_conversations=max_active_conv_per_client,
verbose=args.verbose,
print_content=args.print_content,
verify_output=args.verify_output,
conversation_sampling=args.conversation_sampling,
request_rate=args.request_rate,
max_retries=args.max_retries,
)
if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
if args.limit_min_tokens < 1 or args.limit_max_tokens < 1:
raise ValueError(
"Invalid min/max tokens limits (both limits should be provided)"
)
if args.limit_min_tokens > args.limit_max_tokens:
raise ValueError(
"Invalid min/max tokens limits (min should not be larger than max)"
)
if args.request_timeout_sec <= 0:
raise ValueError("Request timeout must be a positive number")
# Arguments for API requests
chat_url = f"{args.url}/v1/chat/completions"
model_name = args.served_model_name if args.served_model_name else args.model
req_args = RequestArgs(
chat_url=chat_url,
model=model_name,
stream=not args.no_stream,
limit_min_tokens=args.limit_min_tokens,
limit_max_tokens=args.limit_max_tokens,
timeout_sec=args.request_timeout_sec,
)
return client_args, req_args
async def main_mp(
client_args: ClientArgs,
req_args: RequestArgs,
bench_args: BenchmarkArgs,
tokenizer: AutoTokenizer,
input_conv: ConversationsMap,
) -> tuple[ConversationsMap, list[RequestStats]]:
# An event that will trigger graceful termination of all the clients
stop_event = mp.Event()
# Queue for input conversations (from the input file/dataset)
task_queue: mp.Queue = mp.Queue()
# Queue for client measurements (TTFT, TPOT, etc. for each request)
result_queue: mp.Queue = mp.Queue()
# Queue for output conversations (with the LLM answers, sent by the server)
conv_queue: mp.Queue = mp.Queue()
output_conv: ConversationsMap = {}
client_metrics: list[RequestStats] = []
# Start all clients
start_time = time.perf_counter_ns()
logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}")
clients = []
for client_id in range(bench_args.num_clients):
client = mp.Process(
name=f"client_{client_id}",
target=worker_function,
args=(
client_id,
tokenizer,
client_args,
req_args,
stop_event,
task_queue,
result_queue,
conv_queue,
),
)
clients.append(client)
client.start()
# Submit all the input conversations as tasks for the clients
for conv_id, messages in input_conv.items():
task_queue.put((conv_id, messages))
# Add termination signals for clients
for _ in range(bench_args.num_clients):
task_queue.put((TERM_SIGNAL, TERM_SIGNAL))
# Collect the updated conversations from all clients
num_clients_finished = 0
total_convs = len(input_conv)
debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500))
while num_clients_finished < bench_args.num_clients:
# Collect updated conversation
conv_id, messages = conv_queue.get()
# Collect results (measurements)
while not result_queue.empty():
new_data = result_queue.get()
client_metrics.append(new_data)
debug_stats.update(new_data)
if conv_id is TERM_SIGNAL:
num_clients_finished += 1
logger.info(
f"{Color.CYAN}{num_clients_finished} out of "
f"{bench_args.num_clients} clients finished{Color.RESET}"
)
if bench_args.early_stop and not stop_event.is_set():
# Once one client finishes, stop all the other clients.
# There is no reason to continue the benchmark with fewer clients.
logger.info(
f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}"
)
stop_event.set()
else:
output_conv[conv_id] = messages
finished_convs = len(output_conv)
percent = finished_convs / total_convs
# Tuned to control the print rate (can be changed if required)
print_cycle = max(3, int(bench_args.num_clients / 4))
if finished_convs % print_cycle == 0:
runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time)
logger.info(
f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
)
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed)
rps = "N/A"
runtime_left_sec: str | float = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
if percent < 0.05:
# If less than 5% of the conversations have finished,
# the estimation will probably be very inaccurate
# (threshold can be tuned if needed).
runtime_left_sec = "N/A"
logger.info(
f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501
)
debug_stats.print()
logger.info(
f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}"
)
# At this point all the clients finished,
# collect results (TTFT, TPOT, etc.) from all the clients.
# This needs to happen before calling join on the clients
# (result_queue should be emptied).
while not result_queue.empty():
client_metrics.append(result_queue.get())
logger.info(f"Collected {len(client_metrics)} samples from all the clients")
# Wait for all clients to finish
for client in clients:
logger.info(
f"{Color.CYAN}Waiting for client {client.name} "
f"(is alive: {client.is_alive()}){Color.RESET}"
)
client.join(timeout=req_args.timeout_sec + 1)
if client.is_alive():
logger.warning(
f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}"
)
client.terminate()
exitcode = client.exitcode
if exitcode != 0:
logger.error(
f"{Color.RED}Client {client.name} exited "
f"with exit code {exitcode}{Color.RESET}"
)
logger.info(
f"All {bench_args.num_clients} clients exited (successfully "
f"finished {len(output_conv)} out of {total_convs} conversations)"
)
# Queues should be closed, required to avoid hang at interpreter shutdown
unfinished_tasks = 0
while not task_queue.empty():
task_queue.get()
unfinished_tasks += 1
if unfinished_tasks > 0:
# Can happen if not all tasks (conversations) have finished.
# May happen if --max-num-requests was used,
# or if an error occurred in one of the clients.
logger.debug(f"Discarding {unfinished_tasks} unfinished tasks")
task_queue.close()
task_queue.join_thread()
result_queue.close()
result_queue.join_thread()
conv_queue.close()
conv_queue.join_thread()
return output_conv, client_metrics
def get_filename_with_timestamp(label: str, extension: str) -> str:
time_now = datetime.now()
timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S")
filename = f"{label}__{timestamp}.{extension}"
return filename
def process_statistics(
client_metrics: list[RequestStats],
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
warmup_runtime_sec: float | None = None,
) -> None:
if len(client_metrics) == 0:
logger.info("No samples to process")
return
logger.info(f"Processing {len(client_metrics)} samples...")
raw_data = pd.DataFrame(client_metrics)
if verbose:
# Calculate the time between user turns in each conversation (in a new column)
raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"])
raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[
"start_time_ms"
].diff()
# Convert milliseconds to seconds
raw_data["time_between_user_turns_sec"] = (
raw_data["time_between_user_turns_sec"] / 1000.0
)
# Final raw data should be sorted by time
raw_data = raw_data.sort_values(by=["start_time_ms"])
raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"]
percentiles = [0.25, 0.5, 0.75, 0.9]
# Add more percentiles if there are enough samples
if len(raw_data) >= 100:
percentiles.append(0.99)
if len(raw_data) >= 1000:
percentiles.append(0.999)
if len(raw_data) >= 10000:
percentiles.append(0.9999)
# Set precision for numbers in the output text (the dataframes)
pd.set_option("display.precision", 2)
# Exclude parameters from RequestStats
exclude = [
"start_time_ms",
"end_time_ms",
"output_num_first_chunk_tokens",
"approx_cached_percent",
"conversation_id",
"client_id",
]
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Parameters:{Color.RESET}")
for k, v in test_params.items():
print(f"{k}={v}")
# conversations generation parameters
if gen_conv_args is not None:
gen_params = {
"text_files": ", ".join(gen_conv_args.text_files),
"input_num_turns": str(gen_conv_args.input_num_turns),
"input_common_prefix_num_tokens": str(
gen_conv_args.input_common_prefix_num_tokens
),
"input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens),
"input_num_tokens": str(gen_conv_args.input_num_tokens),
"output_num_tokens": str(gen_conv_args.output_num_tokens),
}
print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}")
for k, v in gen_params.items():
print(f"{k}={v}")
print(TEXT_SEPARATOR)
params_list = []
df_list = []
for percent in warmup_percentages:
# Select samples from the end (tail) of the dataframe
warmup_count = int(percent * len(raw_data))
tail_count = len(raw_data) - warmup_count
if tail_count == 0:
# No reason to process if the count of samples is zero
break
df = raw_data.tail(tail_count)
# Runtime is the diff between the end of the last request
# and the start of the first request
runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0]
# Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec
params = {
"runtime_sec": runtime_sec,
"requests_per_sec": requests_per_sec,
}
if warmup_runtime_sec is not None:
params["warmup_runtime_sec"] = warmup_runtime_sec
params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
# List for Excel file
params_list.append(params)
df_list.append(df)
# Print the statistics summary
if percent > 0 or len(warmup_percentages) > 1:
print(
f"{Color.YELLOW}Statistics summary "
f"(assuming {percent:.0%} warmup samples):{Color.RESET}"
)
else:
print(f"{Color.YELLOW}Statistics summary:{Color.RESET}")
for k, v in params.items():
if isinstance(v, float):
print(f"{k} = {v:.3f}")
else:
print(f"{k} = {v}")
print(TEXT_SEPARATOR)
print(df)
print(TEXT_SEPARATOR)
if excel_output:
prefix = f"statistics_{test_params['num_clients']}_clients"
filename = get_filename_with_timestamp(prefix, "xlsx")
with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
startrow = 0
test_params_df = pd.DataFrame([test_params])
test_params_df.to_excel(
writer, sheet_name="Summary", index=False, startrow=startrow
)
startrow += len(test_params_df) + 3
if gen_conv_args is not None:
gen_params_df = pd.DataFrame([gen_params])
gen_params_df.to_excel(
writer, sheet_name="Summary", index=False, startrow=(startrow - 1)
)
startrow += len(gen_params_df) + 3
for params, df_stats in zip(params_list, df_list):
df_params = pd.DataFrame([params])
df_params.to_excel(
writer, sheet_name="Summary", index=False, startrow=startrow
)
startrow += len(df_params) + 2
df_stats.to_excel(
writer, sheet_name="Summary", index=True, startrow=startrow
)
startrow += len(df_stats) + 3
raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0)
logger.info(
f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}"
)
async def get_server_info(url: str) -> None:
logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}")
async with aiohttp.ClientSession() as session:
# Get server version (not mandatory, "version" endpoint may not exist)
url_version = f"{url}/version"
async with session.get(url_version) as response:
if HTTPStatus(response.status) == HTTPStatus.OK:
text = await response.text()
logger.info(f"{Color.BLUE}Server version: {text}{Color.RESET}")
# Get available models
url_models = f"{url}/v1/models"
async with session.get(url_models) as response:
if HTTPStatus(response.status) == HTTPStatus.OK:
text = await response.text()
logger.info(f"{Color.BLUE}Models:{Color.RESET}")
models_data = json.loads(text)
models_list = models_data["data"]
for model in models_list:
model_id = model["id"]
max_model_len = model.get("max_model_len", "N/A")
logger.info(
f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}"
)
else:
logger.info(f"{Color.RED}Failed to get models{Color.RESET}")
async def main() -> None:
parser = argparse.ArgumentParser(
prog="Benchmark serving with multi-turn conversations",
description="Benchmark online inference using REST API",
)
parser.add_argument("--version", action="version", version="%(prog)s 1.0")
parser.add_argument(
"-i",
"--input-file",
type=str,
required=True,
help="Input JSON file with ShareGPT conversations or "
"configuration file for generation of synthetic conversations",
)
parser.add_argument(
"-o",
"--output-file",
type=str,
default=None,
help="Output JSON file containing conversations with updated assistant answers",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Seed for random number generators (default: 0)",
)
parser.add_argument(
"-m", "--model", type=str, required=True, help="Path of the LLM model"
)
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the `--model` argument. ",
)
parser.add_argument(
"-u",
"--url",
type=str,
default="http://localhost:8000",
help="Base URL for the LLM API server",
)
parser.add_argument(
"-p",
"--num-clients",
type=int,
default=1,
help="Number of clients that will send requests in parallel",
)
parser.add_argument(
"-k",
"--max-active-conversations",
type=int,
default=None,
help="Max number of active conversations at a time (for all clients)",
)
parser.add_argument(
"-n",
"--max-num-requests",
type=int,
default=None,
help="Max number of requests to send (total for all clients)",
)
parser.add_argument(
"--warmup-step",
default=False,
action="store_true",
help="Run a warmup step (using only the first turn of every conversation), "
"measurements will not be included in the final benchmark results",
)
parser.add_argument(
"--max-turns",
type=int,
default=None,
help="Maximum number of turns/messages per conversation, "
"includes both user and assistant messages "
"(a positive number, e.g: 2, 4, 6, etc.), disabled by default",
)
parser.add_argument(
"--no-early-stop",
default=False,
action="store_true",
help="By default, the benchmark will stop if at least one client exits."
" Use this flag to disable this behavior",
)
parser.add_argument(
"--limit-max-tokens",
type=int,
default=NUM_TOKENS_FROM_DATASET,
help="Set max_tokens for the output token count of each request "
"(must also set --limit-min-tokens). "
"Overrides output token count from the input dataset. "
"Use a negative value to disable this limit.",
)
parser.add_argument(
"--limit-min-tokens",
type=int,
default=NUM_TOKENS_FROM_DATASET,
help="Set min_tokens for the output token count of each request "
"(must also set --limit-max-tokens). "
"Overrides output token count from the input dataset. "
"Use a negative value to disable this limit.",
)
parser.add_argument(
"--request-rate",
type=float,
default=0,
help="Expected request rate (Poisson process) per client in requests/sec."
"Set to 0 for no delay between requests.",
)
parser.add_argument(
"--max-retries",
type=int,
default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
help="Maximum number of retry attempts for timed-out requests. "
"Default is 0 (no retries). "
"Set to higher values to retry failed requests and maintain "
"fair workload distribution. "
"Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
)
parser.add_argument(
"--conversation-sampling",
type=ConversationSampling,
choices=list(ConversationSampling),
default=ConversationSampling.ROUND_ROBIN,
help=(
"Strategy for selecting which conversation to use for the next request. "
"Options: 'round_robin' (cycle through conversations), "
"'random' (pick randomly)."
),
)
parser.add_argument(
"--verify-output",
default=False,
action="store_true",
help="Verify the LLM output (compare to the answers in the input JSON file)",
)
parser.add_argument(
"--request-timeout-sec",
type=int,
default=120,
help="Timeout in seconds for each API request (default: 120). "
"Automatically increased if max tokens imply longer decoding.",
)
parser.add_argument(
"--no-stream",
default=False,
action="store_true",
help="Disable stream/streaming mode (set 'stream' to False in the API request)",
)
parser.add_argument(
"-e",
"--excel-output",
default=False,
action="store_true",
help="Export summary to Excel file (optional)",
)
parser.add_argument(
"-v",
"--verbose",
default=False,
action="store_true",
help="Enable verbose output",
)
parser.add_argument(
"--print-content",
default=False,
action="store_true",
help="Print the user prompts and the server's answers",
)
parser.add_argument(
"--warmup-percentages",
type=str,
default="0%",
help="Ignore the first X samples as warmup (X is a percentage)."
" A comma separated list of percentages can be used "
"(for example: --warmup-percentages=0%%,50%%)",
)
args = parser.parse_args()
logger.info(args)
logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}")
logger.info(f"url={args.url}")
logger.info(f"model={args.model}")
logger.info(f"num_clients={args.num_clients}")
if args.verify_output:
logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}")
# Calculate the number of samples to filter out (as warmup samples/measurements).
try:
warmup_percentages: list[float] = [0.0]
if not args.warmup_step:
# Warmup percentages are only applied if the warmup step was not used
warmup_strings: list[str] = args.warmup_percentages.split(",")
warmup_strings = [x.replace("%", "") for x in warmup_strings]
warmup_percentages = [float(x) / 100 for x in warmup_strings]
# Check for valid range (0 to 1)
for p in warmup_percentages:
assert p >= 0.0 and p < 1.0
# Sort from high to low warmup percentage
warmup_percentages.sort()
logger.info(
f"Warmup percentages (percentage of samples): {warmup_percentages}"
)
except Exception:
raise ValueError(
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
# Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)
logger.info("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(args.model)
await get_server_info(args.url)
# Load the input file (either conversations or a configuration file)
logger.info(f"Reading input file: {args.input_file}")
with open(args.input_file) as f:
input_data = json.load(f)
gen_conv_args = None
if isinstance(input_data, list):
# The conversations are stored as a list of dicts
logger.info(f"Found {len(input_data)} items in the input file")
# Convert the list to a ConversationsMap
conversations = conversations_list_to_dict(input_data)
elif isinstance(input_data, dict):
# The input file is a configuration file
# (type is determined by the field 'filetype')
if "filetype" not in input_data:
raise Exception(
f"Input file {args.input_file} is invalid (missing 'filetype')"
)
logger.info(f"Using input file with filetype: {input_data['filetype']}")
gen_conv_args = parse_input_json_file(input_data)
# Disable warning from "huggingface/tokenizers"
# (when using python multiprocessing and tokenizers)
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Generate synthetic conversations
conversations = generate_conversations(gen_conv_args, tokenizer)
else:
raise Exception(f"Input file {args.input_file} is invalid")
if args.max_turns is not None:
if args.max_turns < 1:
raise ValueError("Max turns must be a positive number")
logger.info(
f"{Color.PURPLE}Max turns per conversation "
f"is limited to {args.max_turns}{Color.RESET}"
)
# Create benchmark configurations
client_args, req_args = get_client_config(args, conversations)
bench_args = BenchmarkArgs(
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
)
warmup_runtime_sec: float | None = None
# Warm-up step
if args.warmup_step:
# Only send a single user prompt from every conversation.
# max_active_conversations must be 1,
# otherwise the clients may exit after sending a single request
# (because the task queue is empty).
warmup_client_args = client_args._replace(
skip_first_turn=False, max_turns=1, max_active_conversations=1
)
# Early stop should be disabled,
# all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False)
logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
)
warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.PURPLE,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark
benchmark_start_ns = time.perf_counter_ns()
client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations
)
benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
# Calculate requests per second
requests_per_sec = len(client_metrics) / benchmark_runtime_sec
benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
logger.info(
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
"requests per second: %.3f%s",
Color.GREEN,
benchmark_runtime_sec,
benchmark_runtime_ms,
requests_per_sec,
Color.RESET,
)
if warmup_runtime_sec is not None:
total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.GREEN,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info(
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
Color.GREEN,
total_runtime_sec,
total_runtime_sec * 1000,
Color.RESET,
)
# Benchmark parameters
params = {
"model": args.model,
"num_clients": args.num_clients,
"num_conversations": len(conversations),
"active_conversations": args.max_active_conversations,
"seed": args.seed,
}
if args.limit_min_tokens > 0:
params["min_tokens"] = args.limit_min_tokens
if args.limit_max_tokens > 0:
params["max_tokens"] = args.limit_max_tokens
# Process and print statistics (and save excel file with the statistics)
process_statistics(
client_metrics,
test_params=params,
warmup_percentages=warmup_percentages,
verbose=args.verbose,
gen_conv_args=gen_conv_args,
excel_output=args.excel_output,
warmup_runtime_sec=warmup_runtime_sec,
)
if args.output_file is not None:
# Write a JSON file with the updated conversations
# The "assistant" content will contain the answers from the tested LLM
output_data: ShareGptConversations = conversations_dict_to_list(client_convs)
logger.info(
f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}"
)
with open(args.output_file, "w") as f:
json.dump(output_data, f, indent=4)
if __name__ == "__main__":
asyncio.run(main())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json
Convert to OpenAI API:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""
import argparse
import json
import random
from statistics import mean
from typing import Any
import pandas as pd # type: ignore
import tqdm # type: ignore
from transformers import AutoTokenizer # type: ignore
def has_non_english_chars(text: str) -> bool:
return not text.isascii()
def content_is_valid(
content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
if max_content_len and len(content) > max_content_len:
return False
    # Content with non-English (non-ASCII) characters is considered invalid
    return not has_non_english_chars(content)
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
# Collect statistics
stats = []
print("\nCollecting statistics...")
for item in tqdm.tqdm(conversations):
# item has "id" and "messages"
messages = item["messages"]
user_turns = 0
assistant_turns = 0
user_words = 0
assistant_words = 0
conv_chars = 0
user_tokens: list[int] = []
assistant_tokens: list[int] = []
for m in messages:
content = m["content"]
conv_chars += len(content)
content_num_words = content.count(" ") + 1
num_tokens = 0
if tokenizer:
num_tokens = len(tokenizer(m["content"]).input_ids)
if m["role"] == "user":
user_turns += 1
user_words += content_num_words
if tokenizer:
user_tokens.append(num_tokens)
elif m["role"] == "assistant":
assistant_turns += 1
assistant_words += content_num_words
if tokenizer:
assistant_tokens.append(num_tokens)
# assert user_turns == assistant_turns, \
# f"Invalid conversation ID {item['id']}"
conv_words = user_words + assistant_words
item_stats = {
"user_turns": user_turns,
"assistant_turns": assistant_turns,
"user_words": user_words,
"assistant_words": assistant_words,
"conv_turns": len(messages),
"conv_words": conv_words,
"conv_characters": conv_chars,
}
if len(user_tokens) > 0:
item_stats["user_tokens"] = int(mean(user_tokens))
if len(assistant_tokens) > 0:
item_stats["assistant_tokens"] = int(mean(assistant_tokens))
stats.append(item_stats)
print("\nStatistics:")
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
df = pd.DataFrame(stats)
print(df.describe(percentiles=percentiles).transpose())
def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: int | None,
min_content_len: int | None = None,
max_content_len: int | None = None,
min_turns: int | None = None,
max_turns: int | None = None,
model: str | None = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns
if min_content_len and max_content_len:
# Verify that min is not larger than max if both were given
assert min_content_len <= max_content_len
print(
f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
f" {max_content_len=}, {min_turns=}, {max_turns=}\n"
)
random.seed(seed)
tokenizer = None
if model is not None:
print(f"Loading tokenizer from: {model}")
tokenizer = AutoTokenizer.from_pretrained(model)
# Read the ShareGPT JSON file
print(f"Reading file: {input_file}")
with open(input_file, encoding="utf-8") as f:
# Should be a list of dicts
# Each dict should have "id" (string) and "conversations" (list of dicts)
sharegpt_data = json.load(f)
assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"
print(f"Total items in input file: {len(sharegpt_data):,}")
print(f"Shuffling dataset with seed {seed}")
random.shuffle(sharegpt_data)
# Map each conversation ID to all of its messages
conversation_parts: dict[str, list[Any]] = {}
for item in tqdm.tqdm(sharegpt_data):
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
# Conversation ID (e.g. "hiWPlMD") and part/session (0, 1, 2, etc.)
conv_id, _ = item["id"].split("_")
new_turns = item["conversations"]
if conv_id not in conversation_parts:
# Start new conversation
conversation_parts[conv_id] = []
elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
prev_turns = conversation_parts[conv_id][-1]
if prev_turns[-1]["from"] == new_turns[0]["from"]:
new_turns = new_turns[1:]
if len(new_turns) > 0:
# We assume that parts are in order in the ShareGPT dataset
conversation_parts[conv_id].append(new_turns)
dataset: list[dict[str, Any]] = []
for conv_id, conv_parts in conversation_parts.items():
new_item = {"id": conv_id}
conversations: list[dict[str, str]] = []
# Merge all parts
for conv_part in conv_parts:
conversations.extend(conv_part)
if len(conversations) > 0:
new_item["conversations"] = conversations
dataset.append(new_item)
print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")
# Final output data
final_openai_dataset: list[dict] = []
# Filter conversations from the ShareGPT dataset and convert to OpenAI format
for item in tqdm.tqdm(dataset):
messages: list[dict] = []
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
conv_id = item["id"]
conversations = item["conversations"]
if min_turns is not None and len(conversations) < min_turns:
# Skip short conversations
continue
# Convert each message in the conversation, up to max_turns if specified
for i, turn in enumerate(conversations):
assert "from" in turn and "value" in turn, (
f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
)
role = None
turn_from = turn["from"]
if turn_from in {"human", "user"}:
role = "user"
elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
role = "assistant"
elif turn_from == "system":
role = "system"
assert role is not None, (
f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
)
if i == 0 and role != "user":
# If the first message is from assistant (gpt), skip it.
# this happens when the conversation is a follow-up
# to a previous conversation (from the same user).
continue
if max_turns is not None and i >= max_turns:
break
# Convert message to OpenAI format (with "role" and "content")
content = turn["value"]
messages.append({"role": role, "content": content})
# Add the converted conversation to the OpenAI format
if len(messages) > 0:
valid_messages = True
# First turn should always be from the user
user_turn = True
for m in messages:
# Make sure that turns alternate between user and assistant
if (user_turn and m["role"] != "user") or (
not user_turn and m["role"] != "assistant"
):
valid_messages = False
break
user_turn = not user_turn
content = m["content"]
valid_messages = content_is_valid(
content, min_content_len, max_content_len
)
if not valid_messages:
break
if valid_messages is True:
final_openai_dataset.append({"id": conv_id, "messages": messages})
assert len(final_openai_dataset) > 0, "Final number of conversations is zero"
print_stats(final_openai_dataset)
print_stats_again = False
if max_items is not None and len(final_openai_dataset) > max_items:
print(f"\n\nSampling {max_items} items from the dataset...")
print_stats_again = True
final_openai_dataset = random.sample(final_openai_dataset, max_items)
if print_stats_again:
# Print stats after the dataset changed
print_stats(final_openai_dataset, tokenizer)
# Write the converted data to a new JSON file
final_size = len(final_openai_dataset)
print(f"\nTotal conversations converted (after filtering): {final_size:,}")
print(f"\nWriting file: {output_file}")
with open(output_file, "w", encoding="utf-8") as f:
json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)
def main() -> None:
parser = argparse.ArgumentParser(
description="Convert ShareGPT dataset to OpenAI API format"
)
parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
parser.add_argument(
"output_file", help="Path to the output OpenAI format JSON file"
)
parser.add_argument(
"--seed", type=int, default=0, help="Seed for random number generators"
)
parser.add_argument(
"--max-items",
type=int,
default=None,
help="Maximum number of items in the output file",
)
parser.add_argument(
"--min-turns",
type=int,
default=None,
help="Minimum number of turns per conversation",
)
parser.add_argument(
"--max-turns",
type=int,
default=None,
help="Maximum number of turns per conversation",
)
parser.add_argument(
"--min-content-len",
type=int,
default=None,
help="Min number of characters in the messages' content",
)
parser.add_argument(
"--max-content-len",
type=int,
default=None,
help="Max number of characters in the messages' content",
)
parser.add_argument(
"--model",
type=str,
default=None,
help="LLM model, only the tokenizer will be used",
)
args = parser.parse_args()
convert_sharegpt_to_openai(
args.seed,
args.input_file,
args.output_file,
args.max_items,
args.min_content_len,
args.max_content_len,
args.min_turns,
args.max_turns,
args.model,
)
if __name__ == "__main__":
main()
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1
tqdm>=4.66
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile
import pstats
from vllm import LLM, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
LONG_PROMPT = " ".join(LONG_PROMPT)
def main(args):
llm = LLM(
model=args.model,
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
profiler = cProfile.Profile()
print("------warm up------")
for i in range(3):
output = llm.generate(LONG_PROMPT, sampling_params)
print(output[0].outputs[0].text)
print("------start generating------")
for i in range(3):
profiler.runctx(
"llm.generate(LONG_PROMPT, sampling_params)", globals(), locals()
)
# analyze the runtime of hashing function
stats = pstats.Stats(profiler)
stats.sort_stats("cumulative")
total_time = 0
total_calls = 0
for func in stats.stats:
if "hash_of_block" in func[2]:
total_time = stats.stats[func][3]
total_calls = stats.stats[func][0]
percentage = (total_time / stats.total_tt) * 100
print(
f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime."
)
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the performance of hashing function in"
"automatic prefix caching."
)
parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k")
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--output-len", type=int, default=10)
parser.add_argument(
"--enable-prefix-caching", action="store_true", help="enable prefix caching"
)
args = parser.parse_args()
main(args)
#!/bin/bash
# default values
MODEL=${MODEL:-"Qwen/Qwen2.5-7B-Instruct"}
BACKEND=${BACKEND:-"vllm"}
DATASET=${DATASET:-"xgrammar_bench"}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_DIR=${OUTPUT_DIR:-"$SCRIPT_DIR/structured_output_benchmark_results"}
PORT=${PORT:-8000}
STRUCTURED_OUTPUT_RATIO=${STRUCTURED_OUTPUT_RATIO:-1}
TOTAL_SECONDS=${TOTAL_SECONDS:-90}
MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-300}
TOKENIZER_MODE=${TOKENIZER_MODE:-"auto"}
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " --model MODEL Model to benchmark (default: $MODEL)"
echo " --backend BACKEND Backend to use (default: $BACKEND)"
echo " --dataset DATASET Dataset to use (default: $DATASET)"
echo " --max-new-tokens N Maximum number of tokens to generate (default: $MAX_NEW_TOKENS)"
echo " --output-dir DIR Output directory for results (default: $OUTPUT_DIR)"
echo " --port PORT Port to use (default: $PORT)"
echo " --structured-output-ratio N Ratio of structured outputs (default: $STRUCTURED_OUTPUT_RATIO)"
echo " --tokenizer-mode MODE Tokenizer mode to use (default: $TOKENIZER_MODE)"
echo " --total-seconds N Total seconds to run the benchmark (default: $TOTAL_SECONDS)"
echo " -h, --help Show this help message and exit"
exit 0
}
# parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
--backend)
BACKEND="$2"
shift 2
;;
--dataset)
DATASET="$2"
shift 2
;;
--max-new-tokens)
MAX_NEW_TOKENS="$2"
shift 2
;;
--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--structured-output-ratio)
STRUCTURED_OUTPUT_RATIO="$2"
shift 2
;;
--tokenizer-mode)
TOKENIZER_MODE="$2"
shift 2
;;
--total-seconds)
TOTAL_SECONDS="$2"
shift 2
;;
-h|--help)
usage
;;
*)
printf "Unknown argument: %s\n" "$1"
usage
;;
esac
done
# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# Define QPS values to test
QPS_VALUES=(25 20 15 10 5 1)
# Common parameters
COMMON_PARAMS=(
--backend "$BACKEND"
--model "$MODEL"
--dataset "$DATASET"
--structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--save-results
--result-dir "$OUTPUT_DIR"
--output-len "$MAX_NEW_TOKENS"
--port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
echo "Dataset: $DATASET"
echo "Results will be saved to: $OUTPUT_DIR"
echo "----------------------------------------"
# Run benchmarks with different QPS values
for qps in "${QPS_VALUES[@]}"; do
echo "Running benchmark with QPS: $qps"
# Get git hash and branch for the filename
GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
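# e.g. TOTAL_SECONDS=90 and qps=25 -> 90 * 25 = 2250 prompts for this run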
echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate "$qps" \
--result-filename "$FILENAME" \
--num-prompts "$NUM_PROMPTS"
echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"
done
echo "All benchmarks completed!"
echo "Results saved to: $OUTPUT_DIR"
FROM fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed'st thy light'st flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel.
Thou that art now the world's fresh ornament
And only herald to the gaudy spring,
Within thine own bud buriest thy content
And, tender churl, makest waste in niggarding.
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.
When forty winters shall beseige thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery, so gazed on now,
Will be a tatter'd weed, of small worth held:
Then being ask'd where all thy beauty lies,
Where all the treasure of thy lusty days,
To say, within thine own deep-sunken eyes,
Were an all-eating shame and thriftless praise.
How much more praise deserved thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.
Look in thy glass, and tell the face thou viewest
Now is the time that face should form another;
Whose fresh repair if now thou not renewest,
Thou dost beguile the world, unbless some mother.
For where is she so fair whose unear'd womb
Disdains the tillage of thy husbandry?
Or who is he so fond will be the tomb
Of his self-love, to stop posterity?
Thou art thy mother's glass, and she in thee
Calls back the lovely April of her prime:
So thou through windows of thine age shall see
Despite of wrinkles this thy golden time.
But if thou live, remember'd not to be,
Die single, and thine image dies with thee.
Unthrifty loveliness, why dost thou spend
Upon thyself thy beauty's legacy?
Nature's bequest gives nothing but doth lend,
And being frank she lends to those are free.
Then, beauteous niggard, why dost thou abuse
The bounteous largess given thee to give?
Profitless usurer, why dost thou use
So great a sum of sums, yet canst not live?
For having traffic with thyself alone,
Thou of thyself thy sweet self dost deceive.
Then how, when nature calls thee to be gone,
What acceptable audit canst thou leave?
Thy unused beauty must be tomb'd with thee,
Which, used, lives th' executor to be.
Those hours, that with gentle work did frame
The lovely gaze where every eye doth dwell,
Will play the tyrants to the very same
And that unfair which fairly doth excel:
For never-resting time leads summer on
To hideous winter and confounds him there;
Sap cheque'd with frost and lusty leaves quite gone,
Beauty o'ersnow'd and bareness every where:
Then, were not summer's distillation left,
A liquid prisoner pent in walls of glass,
Beauty's effect with beauty were bereft,
Nor it nor no remembrance what it was:
But flowers distill'd though they with winter meet,
Leese but their show; their substance still lives sweet.
Then let not winter's ragged hand deface
In thee thy summer, ere thou be distill'd:
Make sweet some vial; treasure thou some place
With beauty's treasure, ere it be self-kill'd.
That use is not forbidden usury,
Which happies those that pay the willing loan;
That's for thyself to breed another thee,
Or ten times happier, be it ten for one;
Ten times thyself were happier than thou art,
If ten of thine ten times refigured thee:
Then what could death do, if thou shouldst depart,
Leaving thee living in posterity?
Be not self-will'd, for thou art much too fair
To be death's conquest and make worms thine heir.
Lo! in the orient when the gracious light
Lifts up his burning head, each under eye
Doth homage to his new-appearing sight,
Serving with looks his sacred majesty;
And having climb'd the steep-up heavenly hill,
Resembling strong youth in his middle age,
yet mortal looks adore his beauty still,
Attending on his golden pilgrimage;
But when from highmost pitch, with weary car,
Like feeble age, he reeleth from the day,
The eyes, 'fore duteous, now converted are
From his low tract and look another way:
So thou, thyself out-going in thy noon,
Unlook'd on diest, unless thou get a son.
Music to hear, why hear'st thou music sadly?
Sweets with sweets war not, joy delights in joy.
Why lovest thou that which thou receivest not gladly,
Or else receivest with pleasure thine annoy?
If the true concord of well-tuned sounds,
By unions married, do offend thine ear,
They do but sweetly chide thee, who confounds
In singleness the parts that thou shouldst bear.
Mark how one string, sweet husband to another,
Strikes each in each by mutual ordering,
Resembling sire and child and happy mother
Who all in one, one pleasing note do sing:
Whose speechless song, being many, seeming one,
Sings this to thee: 'thou single wilt prove none.'
Is it for fear to wet a widow's eye
That thou consumest thyself in single life?
Ah! if thou issueless shalt hap to die.
The world will wail thee, like a makeless wife;
The world will be thy widow and still weep
That thou no form of thee hast left behind,
When every private widow well may keep
By children's eyes her husband's shape in mind.
Look, what an unthrift in the world doth spend
Shifts but his place, for still the world enjoys it;
But beauty's waste hath in the world an end,
And kept unused, the user so destroys it.
No love toward others in that bosom sits
That on himself such murderous shame commits.
For shame! deny that thou bear'st love to any,
Who for thyself art so unprovident.
Grant, if thou wilt, thou art beloved of many,
But that thou none lovest is most evident;
For thou art so possess'd with murderous hate
That 'gainst thyself thou stick'st not to conspire.
Seeking that beauteous roof to ruinate
Which to repair should be thy chief desire.
O, change thy thought, that I may change my mind!
Shall hate be fairer lodged than gentle love?
Be, as thy presence is, gracious and kind,
Or to thyself at least kind-hearted prove:
Make thee another self, for love of me,
That beauty still may live in thine or thee.
As fast as thou shalt wane, so fast thou growest
In one of thine, from that which thou departest;
And that fresh blood which youngly thou bestowest
Thou mayst call thine when thou from youth convertest.
Herein lives wisdom, beauty and increase:
Without this, folly, age and cold decay:
If all were minded so, the times should cease
And threescore year would make the world away.
Let those whom Nature hath not made for store,
Harsh featureless and rude, barrenly perish:
Look, whom she best endow'd she gave the more;
Which bounteous gift thou shouldst in bounty cherish:
She carved thee for her seal, and meant thereby
Thou shouldst print more, not let that copy die.
When I do count the clock that tells the time,
And see the brave day sunk in hideous night;
When I behold the violet past prime,
And sable curls all silver'd o'er with white;
When lofty trees I see barren of leaves
Which erst from heat did canopy the herd,
And summer's green all girded up in sheaves
Borne on the bier with white and bristly beard,
Then of thy beauty do I question make,
That thou among the wastes of time must go,
Since sweets and beauties do themselves forsake
And die as fast as they see others grow;
And nothing 'gainst Time's scythe can make defence
Save breed, to brave him when he takes thee hence.
O, that you were yourself! but, love, you are
No longer yours than you yourself here live:
Against this coming end you should prepare,
And your sweet semblance to some other give.
So should that beauty which you hold in lease
Find no determination: then you were
Yourself again after yourself's decease,
When your sweet issue your sweet form should bear.
Who lets so fair a house fall to decay,
Which husbandry in honour might uphold
Against the stormy gusts of winter's day
And barren rage of death's eternal cold?
O, none but unthrifts! Dear my love, you know
You had a father: let your son say so.
Not from the stars do I my judgment pluck;
And yet methinks I have astronomy,
But not to tell of good or evil luck,
Of plagues, of dearths, or seasons' quality;
Nor can I fortune to brief minutes tell,
Pointing to each his thunder, rain and wind,
Or say with princes if it shall go well,
By oft predict that I in heaven find:
But from thine eyes my knowledge I derive,
And, constant stars, in them I read such art
As truth and beauty shall together thrive,
If from thyself to store thou wouldst convert;
Or else of thee this I prognosticate:
Thy end is truth's and beauty's doom and date.
When I consider every thing that grows
Holds in perfection but a little moment,
That this huge stage presenteth nought but shows
Whereon the stars in secret influence comment;
When I perceive that men as plants increase,
Cheered and cheque'd even by the self-same sky,
Vaunt in their youthful sap, at height decrease,
And wear their brave state out of memory;
Then the conceit of this inconstant stay
Sets you most rich in youth before my sight,
Where wasteful Time debateth with Decay,
To change your day of youth to sullied night;
And all in war with Time for love of you,
As he takes from you, I engraft you new.
But wherefore do not you a mightier way
Make war upon this bloody tyrant, Time?
And fortify yourself in your decay
With means more blessed than my barren rhyme?
Now stand you on the top of happy hours,
And many maiden gardens yet unset
With virtuous wish would bear your living flowers,
Much liker than your painted counterfeit:
So should the lines of life that life repair,
Which this, Time's pencil, or my pupil pen,
Neither in inward worth nor outward fair,
Can make you live yourself in eyes of men.
To give away yourself keeps yourself still,
And you must live, drawn by your own sweet skill.
Who will believe my verse in time to come,
If it were fill'd with your most high deserts?
Though yet, heaven knows, it is but as a tomb
Which hides your life and shows not half your parts.
If I could write the beauty of your eyes
And in fresh numbers number all your graces,
The age to come would say 'This poet lies:
Such heavenly touches ne'er touch'd earthly faces.'
So should my papers yellow'd with their age
Be scorn'd like old men of less truth than tongue,
And your true rights be term'd a poet's rage
And stretched metre of an antique song:
But were some child of yours alive that time,
You should live twice; in it and in my rhyme.
Shall I compare thee to a summer's day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,
And summer's lease hath all too short a date:
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimm'd;
And every fair from fair sometime declines,
By chance or nature's changing course untrimm'd;
But thy eternal summer shall not fade
Nor lose possession of that fair thou owest;
Nor shall Death brag thou wander'st in his shade,
When in eternal lines to time thou growest:
So long as men can breathe or eyes can see,
So long lives this and this gives life to thee.
Devouring Time, blunt thou the lion's paws,
And make the earth devour her own sweet brood;
Pluck the keen teeth from the fierce tiger's jaws,
And burn the long-lived phoenix in her blood;
Make glad and sorry seasons as thou fleets,
And do whate'er thou wilt, swift-footed Time,
To the wide world and all her fading sweets;
But I forbid thee one most heinous crime:
O, carve not with thy hours my love's fair brow,
Nor draw no lines there with thine antique pen;
Him in thy course untainted do allow
For beauty's pattern to succeeding men.
Yet, do thy worst, old Time: despite thy wrong,
My love shall in my verse ever live young.
A woman's face with Nature's own hand painted
Hast thou, the master-mistress of my passion;
A woman's gentle heart, but not acquainted
With shifting change, as is false women's fashion;
An eye more bright than theirs, less false in rolling,
Gilding the object whereupon it gazeth;
A man in hue, all 'hues' in his controlling,
Much steals men's eyes and women's souls amazeth.
And for a woman wert thou first created;
Till Nature, as she wrought thee, fell a-doting,
And by addition me of thee defeated,
By adding one thing to my purpose nothing.
But since she prick'd thee out for women's pleasure,
Mine be thy love and thy love's use their treasure.
So is it not with me as with that Muse
Stirr'd by a painted beauty to his verse,
Who heaven itself for ornament doth use
And every fair with his fair doth rehearse
Making a couplement of proud compare,
With sun and moon, with earth and sea's rich gems,
With April's first-born flowers, and all things rare
That heaven's air in this huge rondure hems.
O' let me, true in love, but truly write,
And then believe me, my love is as fair
As any mother's child, though not so bright
As those gold candles fix'd in heaven's air:
Let them say more than like of hearsay well;
I will not praise that purpose not to sell.
My glass shall not persuade me I am old,
So long as youth and thou are of one date;
But when in thee time's furrows I behold,
Then look I death my days should expiate.
For all that beauty that doth cover thee
Is but the seemly raiment of my heart,
Which in thy breast doth live, as thine in me:
How can I then be elder than thou art?
O, therefore, love, be of thyself so wary
As I, not for myself, but for thee will;
Bearing thy heart, which I will keep so chary
As tender nurse her babe from faring ill.
Presume not on thy heart when mine is slain;
Thou gavest me thine, not to give back again.
As an unperfect actor on the stage
Who with his fear is put besides his part,
Or some fierce thing replete with too much rage,
Whose strength's abundance weakens his own heart.
So I, for fear of trust, forget to say
The perfect ceremony of love's rite,
And in mine own love's strength seem to decay,
O'ercharged with burden of mine own love's might.
O, let my books be then the eloquence
And dumb presagers of my speaking breast,
Who plead for love and look for recompense
More than that tongue that more hath more express'd.
O, learn to read what silent love hath writ:
To hear with eyes belongs to love's fine wit.
Mine eye hath play'd the painter and hath stell'd
Thy beauty's form in table of my heart;
My body is the frame wherein 'tis held,
And perspective it is the painter's art.
For through the painter must you see his skill,
To find where your true image pictured lies;
Which in my bosom's shop is hanging still,
That hath his windows glazed with thine eyes.
Now see what good turns eyes for eyes have done:
Mine eyes have drawn thy shape, and thine for me
Are windows to my breast, where-through the sun
Delights to peep, to gaze therein on thee;
Yet eyes this cunning want to grace their art;
They draw but what they see, know not the heart.
Let those who are in favour with their stars
Of public honour and proud titles boast,
Whilst I, whom fortune of such triumph bars,
Unlook'd for joy in that I honour most.
Great princes' favourites their fair leaves spread
But as the marigold at the sun's eye,
And in themselves their pride lies buried,
For at a frown they in their glory die.
The painful warrior famoused for fight,
After a thousand victories once foil'd,
Is from the book of honour razed quite,
And all the rest forgot for which he toil'd:
Then happy I, that love and am beloved
Where I may not remove nor be removed.
Lord of my love, to whom in vassalage
Thy merit hath my duty strongly knit,
To thee I send this written embassage,
To witness duty, not to show my wit:
Duty so great, which wit so poor as mine
May make seem bare, in wanting words to show it,
But that I hope some good conceit of thine
In thy soul's thought, all naked, will bestow it;
Till whatsoever star that guides my moving
Points on me graciously with fair aspect
And puts apparel on my tatter'd loving,
To show me worthy of thy sweet respect:
Then may I dare to boast how I do love thee;
Till then not show my head where thou mayst prove me.
Weary with toil, I haste me to my bed,
The dear repose for limbs with travel tired;
But then begins a journey in my head,
To work my mind, when body's work's expired:
For then my thoughts, from far where I abide,
Intend a zealous pilgrimage to thee,
And keep my drooping eyelids open wide,
Looking on darkness which the blind do see
Save that my soul's imaginary sight
Presents thy shadow to my sightless view,
Which, like a jewel hung in ghastly night,
Makes black night beauteous and her old face new.
Lo! thus, by day my limbs, by night my mind,
For thee and for myself no quiet find.
How can I then return in happy plight,
That am debarr'd the benefit of rest?
When day's oppression is not eased by night,
But day by night, and night by day, oppress'd?
And each, though enemies to either's reign,
Do in consent shake hands to torture me;
The one by toil, the other to complain
How far I toil, still farther off from thee.
I tell the day, to please them thou art bright
And dost him grace when clouds do blot the heaven:
So flatter I the swart-complexion'd night,
When sparkling stars twire not thou gild'st the even.
But day doth daily draw my sorrows longer
And night doth nightly make grief's strength seem stronger.
When, in disgrace with fortune and men's eyes,
I all alone beweep my outcast state
And trouble deal heaven with my bootless cries
And look upon myself and curse my fate,
Wishing me like to one more rich in hope,
Featured like him, like him with friends possess'd,
Desiring this man's art and that man's scope,
With what I most enjoy contented least;
Yet in these thoughts myself almost despising,
Haply I think on thee, and then my state,
Like to the lark at break of day arising
From sullen earth, sings hymns at heaven's gate;
For thy sweet love remember'd such wealth brings
That then I scorn to change my state with kings.
When to the sessions of sweet silent thought
I summon up remembrance of things past,
I sigh the lack of many a thing I sought,
And with old woes new wail my dear time's waste:
Then can I drown an eye, unused to flow,
For precious friends hid in death's dateless night,
And weep afresh love's long since cancell'd woe,
And moan the expense of many a vanish'd sight:
Then can I grieve at grievances foregone,
And heavily from woe to woe tell o'er
The sad account of fore-bemoaned moan,
Which I new pay as if not paid before.
But if the while I think on thee, dear friend,
All losses are restored and sorrows end.
Thy bosom is endeared with all hearts,
Which I by lacking have supposed dead,
And there reigns love and all love's loving parts,
And all those friends which I thought buried.
How many a holy and obsequious tear
Hath dear religious love stol'n from mine eye
As interest of the dead, which now appear
But things removed that hidden in thee lie!
Thou art the grave where buried love doth live,
Hung with the trophies of my lovers gone,
Who all their parts of me to thee did give;
That due of many now is thine alone:
Their images I loved I view in thee,
And thou, all they, hast all the all of me.
If thou survive my well-contented day,
When that churl Death my bones with dust shall cover,
And shalt by fortune once more re-survey
These poor rude lines of thy deceased lover,
Compare them with the bettering of the time,
And though they be outstripp'd by every pen,
Reserve them for my love, not for their rhyme,
Exceeded by the height of happier men.
O, then vouchsafe me but this loving thought:
'Had my friend's Muse grown with this growing age,
A dearer birth than this his love had brought,
To march in ranks of better equipage:
But since he died and poets better prove,
Theirs for their style I'll read, his for his love.'
Full many a glorious morning have I seen
Flatter the mountain-tops with sovereign eye,
Kissing with golden face the meadows green,
Gilding pale streams with heavenly alchemy;
Anon permit the basest clouds to ride
With ugly rack on his celestial face,
And from the forlorn world his visage hide,
Stealing unseen to west with this disgrace:
Even so my sun one early morn did shine
With all triumphant splendor on my brow;
But out, alack! he was but one hour mine;
The region cloud hath mask'd him from me now.
Yet him for this my love no whit disdaineth;
Suns of the world may stain when heaven's sun staineth.
Why didst thou promise such a beauteous day,
And make me travel forth without my cloak,
To let base clouds o'ertake me in my way,
Hiding thy bravery in their rotten smoke?
'Tis not enough that through the cloud thou break,
To dry the rain on my storm-beaten face,
For no man well of such a salve can speak
That heals the wound and cures not the disgrace:
Nor can thy shame give physic to my grief;
Though thou repent, yet I have still the loss:
The offender's sorrow lends but weak relief
To him that bears the strong offence's cross.
Ah! but those tears are pearl which thy love sheds,
And they are rich and ransom all ill deeds.
No more be grieved at that which thou hast done:
Roses have thorns, and silver fountains mud;
Clouds and eclipses stain both moon and sun,
And loathsome canker lives in sweetest bud.
All men make faults, and even I in this,
Authorizing thy trespass with compare,
Myself corrupting, salving thy amiss,
Excusing thy sins more than thy sins are;
For to thy sensual fault I bring in sense--
Thy adverse party is thy advocate--
And 'gainst myself a lawful plea commence:
Such civil war is in my love and hate
That I an accessary needs must be
To that sweet thief which sourly robs from me.
Let me confess that we two must be twain,
Although our undivided loves are one:
So shall those blots that do with me remain
Without thy help by me be borne alone.
In our two loves there is but one respect,
Though in our lives a separable spite,
Which though it alter not love's sole effect,
Yet doth it steal sweet hours from love's delight.
I may not evermore acknowledge thee,
Lest my bewailed guilt should do thee shame,
Nor thou with public kindness honour me,
Unless thou take that honour from thy name:
But do not so; I love thee in such sort
As, thou being mine, mine is thy good report.
As a decrepit father takes delight
To see his active child do deeds of youth,
So I, made lame by fortune's dearest spite,
Take all my comfort of thy worth and truth.
For whether beauty, birth, or wealth, or wit,
Or any of these all, or all, or more,
Entitled in thy parts do crowned sit,
I make my love engrafted to this store:
So then I am not lame, poor, nor despised,
Whilst that this shadow doth such substance give
That I in thy abundance am sufficed
And by a part of all thy glory live.
Look, what is best, that best I wish in thee:
This wish I have; then ten times happy me!
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(MACOSX_FOUND TRUE)
endif()
#
# Define environment variables for special configurations
#
set(ENABLE_X86_ISA $ENV{VLLM_CPU_X86})
set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})
include_directories("${CMAKE_SOURCE_DIR}/csrc")
set (ENABLE_NUMA TRUE)
#
# Check the compile flags
#
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
endif()
if (NOT MACOSX_FOUND)
execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)
if (NOT CPUINFO_RET EQUAL 0)
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()
endif()
function (find_isa CPUINFO TARGET OUT)
string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
if(NOT ISA_FOUND EQUAL -1)
set(${OUT} ON PARENT_SCOPE)
else()
set(${OUT} OFF PARENT_SCOPE)
endif()
endfunction()
function(check_sysctl TARGET OUT)
execute_process(COMMAND sysctl -n "${TARGET}"
RESULT_VARIABLE SYSCTL_RET
OUTPUT_VARIABLE SYSCTL_INFO
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(SYSCTL_RET EQUAL 0 AND
(SYSCTL_INFO STREQUAL "1" OR SYSCTL_INFO GREATER 0))
set(${OUT} ON PARENT_SCOPE)
else()
set(${OUT} OFF PARENT_SCOPE)
endif()
endfunction()
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(STATUS "Apple Silicon Detected")
set(APPLE_SILICON_FOUND TRUE)
set(ENABLE_NUMA OFF)
check_sysctl(hw.optional.neon ASIMD_FOUND)
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
else()
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_ARM_BF16)
set(ARM_BF16_FOUND ON)
message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
endif()
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
set(ENABLE_X86_ISA ON)
if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3))
message(FATAL_ERROR "X86 backend requires gcc/g++ >= 12.3")
endif()
list(APPEND CXX_COMPILE_FLAGS "-mf16c")
list(APPEND CXX_COMPILE_FLAGS_AVX512 ${CXX_COMPILE_FLAGS})
list(APPEND CXX_COMPILE_FLAGS_AVX2 ${CXX_COMPILE_FLAGS})
list(APPEND CXX_COMPILE_FLAGS_AVX512
"-mavx512f"
"-mavx512vl"
"-mavx512bw"
"-mavx512dq"
"-mavx512bf16"
"-mavx512vnni"
"-mamx-bf16"
"-mamx-tile")
list(APPEND CXX_COMPILE_FLAGS_AVX2
"-mavx2")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
message(STATUS "PowerPC detected")
if (POWER9_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power9"
"-mtune=power9")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected")
if(ARM_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
add_compile_definitions(ARM_BF16_SUPPORT)
else()
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
elseif (S390_FOUND)
message(STATUS "S390 detected")
# Check for S390 VXE support
list(APPEND CXX_COMPILE_FLAGS
"-mvx"
"-mzvector"
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
if(RVV_FOUND)
message(FATAL_ERROR "RISC-V RVV is not supported yet.")
else()
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
else()
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
# Build oneDNN for GEMM kernels
if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
if(ASIMD_FOUND)
# Set number of parallel build processes
include(ProcessorCount)
ProcessorCount(NPROC)
if(NOT NPROC)
set(NPROC 4)
endif()
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
find_library(OPEN_MP
NAMES gomp
PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if (OPEN_MP)
set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
endif()
# Fetch and populate ACL
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
FetchContent_Populate(arm_compute
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.6.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
endif()
# Build ACL with CMake
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
-DCMAKE_BUILD_TYPE=Release
-DARM_COMPUTE_ARCH=armv8.2-a
-DARM_COMPUTE_ENABLE_ASSERTS=OFF
-DARM_COMPUTE_ENABLE_CPPTHREADS=OFF
-DARM_COMPUTE_ENABLE_OPENMP=ON
-DARM_COMPUTE_ENABLE_WERROR=OFF
-DARM_COMPUTE_BUILD_EXAMPLES=OFF
-DARM_COMPUTE_BUILD_TESTING=OFF)
set(_cmake_build_cmd
${CMAKE_COMMAND} --build build -- -j${NPROC}
)
execute_process(
COMMAND ${_cmake_config_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
)
execute_process(
COMMAND ${_cmake_build_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
if(NOT _acl_rc EQUAL 0)
message(FATAL_ERROR "ACL build failed (exit ${_acl_rc}).")
endif()
message(STATUS "Arm Compute Library (ACL) built successfully.")
# VLLM/oneDNN settings for ACL
set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
add_compile_definitions(VLLM_USE_ACL)
endif()
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
FetchContent_Declare(
oneDNN
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")
set(ONEDNN_BUILD_TESTS "OFF")
set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ONEDNN_BUILD_GRAPH "OFF")
set(ONEDNN_ENABLE_JIT_PROFILING "ON")
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "ON")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "ON")
set(ONEDNN_VERBOSE "ON")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# TODO: Refactor this
if (ENABLE_X86_ISA)
# Note: only enable oneDNN for AVX512
list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
else()
list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
endif()
set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
FetchContent_MakeAvailable(oneDNN)
set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
target_include_directories(
dnnl_ext
PUBLIC ${oneDNN_SOURCE_DIR}/include
PUBLIC ${oneDNN_BINARY_DIR}/include
PRIVATE ${oneDNN_SOURCE_DIR}/src
)
target_link_libraries(dnnl_ext dnnl torch)
target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
list(APPEND LIBS dnnl_ext)
set(USE_ONEDNN ON)
else()
set(USE_ONEDNN OFF)
endif()
# TODO: Refactor this
if (ENABLE_X86_ISA)
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
else()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
endif()
if(ENABLE_NUMA)
list(APPEND LIBS numa)
else()
message(STATUS "NUMA is disabled")
add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()
#
# Generate CPU attention dispatch header
#
message(STATUS "Generating CPU attention dispatch header")
execute_process(
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
RESULT_VARIABLE GEN_RESULT
)
if(NOT GEN_RESULT EQUAL 0)
message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
endif()
#
# _C extension
#
set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/torch_bindings.cpp")
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()
if(USE_ONEDNN)
set(VLLM_EXT_SRC
"csrc/cpu/dnnl_kernels.cpp"
${VLLM_EXT_SRC})
endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/dnnl_kernels.cpp"
"csrc/cpu/torch_bindings.cpp"
# TODO: Remove these files
"csrc/cpu/activation.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
set(VLLM_EXT_SRC_AVX2
"csrc/cpu/utils.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/torch_bindings.cpp"
# TODO: Remove these files
"csrc/cpu/activation.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
USE_SABI 3
WITH_SOABI
)
# For SGL kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
define_extension_target(
_C_AVX2
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX2}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
USE_SABI 3
WITH_SOABI
)
else()
message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
#
# Define extension targets
#
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI
)
endif()
message(STATUS "Enabling C extension.")
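The `VLLM_CPU_X86` and `VLLM_CPU_ARM_BF16` environment variables read at the top of this file let the ISA detection above be overridden, e.g. when cross-compiling. A minimal sketch of how a build might use them follows; the `pip` invocation and the `VLLM_TARGET_DEVICE` value are assumptions about the surrounding build setup, not taken from this file.

```bash
# Cross-compiling for x86: force the AVX512/AVX2 code paths even when the
# build machine's /proc/cpuinfo does not report x86 features
# (feeds ENABLE_X86_ISA above).
export VLLM_CPU_X86=1

# ...or, when targeting an aarch64 machine whose cpuinfo is not visible at
# build time, force-enable the BF16 kernels instead:
# export VLLM_CPU_ARM_BF16=1

# Hypothetical CPU-backend build invocation; the exact command may differ
# between vLLM versions.
VLLM_TARGET_DEVICE=cpu pip install -e . --no-build-isolation
```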
include(FetchContent)
# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory
# instead of downloading.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{FLASH_MLA_SRC_DIR})
set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
endif()
if(FLASH_MLA_SRC_DIR)
FetchContent_Declare(
flashmla
SOURCE_DIR ${FLASH_MLA_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# Vendor FlashMLA interface into vLLM with torch-ops shim.
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
FLASHMLA_INTERFACE_CONTENT)
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
FLASHMLA_INTERFACE_CONTENT
"${FLASHMLA_INTERFACE_CONTENT}")
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
"${FLASHMLA_INTERFACE_CONTENT}")
# Install the generated flash_mla_interface.py to the wheel
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
DESTINATION vllm/third_party/flashmla/
COMPONENT _flashmla_C)
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
set(SUPPORT_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
list(APPEND SUPPORT_ARCHS "9.0a")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
# CUDA 12.9 introduced "Family-Specific Architecture Features";
# 10.0f covers the whole compute_10x family
list(APPEND SUPPORT_ARCHS "10.0f")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
list(APPEND SUPPORT_ARCHS "10.0a")
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
# Misc kernels for decoding
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
# sm90 dense decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
# sm90 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
# sm90 sparse prefill
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
# sm100 dense prefill & backward
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
# sm100 sparse prefill
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
# sm100 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
)
set(FlashMLA_Extension_SOURCES
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/kerutils/include
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set(FlashMLA_Extension_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_Extension_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
define_extension_target(
_flashmla_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_SOURCES}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
target_compile_options(_flashmla_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
define_extension_target(
_flashmla_extension_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_Extension_SOURCES}
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
target_compile_options(_flashmla_extension_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else()
message(STATUS "FlashMLA will not compile: no supported CUDA architecture in ${CUDA_ARCHS} (needs sm90a with CUDA 12.3+ or sm100 with CUDA 12.8+)")
# Create empty targets for setup.py on unsupported systems
add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif()
include(FetchContent)
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
if(DEFINED ENV{QUTLASS_SRC_DIR})
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()
if(QUTLASS_SRC_DIR)
FetchContent_Declare(
qutlass
SOURCE_DIR ${QUTLASS_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
qutlass
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS)
if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
else()
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
endif()
set(QUTLASS_SOURCES
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
)
set(QUTLASS_INCLUDES
${qutlass_SOURCE_DIR}
${qutlass_SOURCE_DIR}/qutlass
${qutlass_SOURCE_DIR}/qutlass/csrc/include
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
)
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
else()
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
endif()
set_gencode_flags_for_srcs(
SRCS "${QUTLASS_SOURCES}"
CUDA_ARCHS "${QUTLASS_ARCHS}"
)
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
target_compile_definitions(_C PRIVATE
QUTLASS_DISABLE_PYBIND=1
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
)
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
)
else()
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
message(STATUS
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
else()
message(STATUS
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a / 10.3a / 10.0f) found in "
"CUDA_ARCHS='${CUDA_ARCHS}'.")
endif()
endif()
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
# Set TRITON_KERNELS_SRC_DIR for local development with vLLM. TRITON_KERNELS_SRC_DIR is expected to
# point directly at the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
FetchContent_Declare(
triton_kernels
SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
)
else()
set(TRITON_GIT "https://github.com/triton-lang/triton.git")
message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
FetchContent_Declare(
triton_kernels
# TODO (varun) : Fetch just the triton_kernels directory from Triton
GIT_REPOSITORY https://github.com/triton-lang/triton.git
GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
GIT_PROGRESS TRUE
SOURCE_SUBDIR python/triton_kernels/triton_kernels
)
endif()
# Fetch content
FetchContent_MakeAvailable(triton_kernels)
if (NOT triton_kernels_SOURCE_DIR)
message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
endif()
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
else()
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
endif()
message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
add_custom_target(triton_kernels)
# Ensure the vllm/third_party directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
## Copy .py files to install directory.
install(DIRECTORY
${TRITON_KERNELS_PYTHON_DIR}
DESTINATION
vllm/third_party/triton_kernels/
COMPONENT triton_kernels
FILES_MATCHING PATTERN "*.py")
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc). Since we clear the
# arches in the CUDA case (and instead set the gencodes on a per-file basis),
# we need to set VLLM_GPU_ARCHES manually here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2), --component _vllm_fa3_C (for FA3),
# or --component _vllm_fa4_cutedsl_C (for FA4 CuteDSL Python files).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Make sure vllm-flash-attn install rules are nested under vllm/
# ALL_COMPONENTS ensures the save/modify/restore runs exactly once regardless
# of how many components are being installed, avoiding double-append of /vllm/.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix after FA's install rules
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Install shared Python files for both FA2 and FA3 components
foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C)
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")"
COMPONENT ${_FA_COMPONENT})
# Copy vllm_flash_attn python files (except __init__.py and flash_attn_interface.py
# which are source-controlled in vllm)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT ${_FA_COMPONENT}
FILES_MATCHING PATTERN "*.py"
PATTERN "__init__.py" EXCLUDE
PATTERN "flash_attn_interface.py" EXCLUDE
)
endforeach()
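Several of the FetchContent blocks above honour local-source overrides and component-scoped installs. The sketch below shows how those hooks might be used during development; the paths are placeholders and the `build` directory in the `cmake --install` call is an assumed, pre-configured build tree.

```bash
# Point the build at local checkouts instead of letting FetchContent clone
# them (each variable is read by the corresponding block above).
export FLASH_MLA_SRC_DIR=/path/to/FlashMLA
export QUTLASS_SRC_DIR=/path/to/qutlass
export TRITON_KERNELS_SRC_DIR=/path/to/triton/python/triton_kernels/triton_kernels
export VLLM_FLASH_ATTN_SRC_DIR=/path/to/vllm-flash-attention

# After configuring and building, install only a single component, e.g. the
# FA2 kernels; omitting --component installs everything, including
# vllm-flash-attn (see the comments above).
cmake --install build --component _vllm_fa2_C
```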
#
# FA4 CuteDSL component
# This is a Python-only component that copies the flash_attn/cute directory
# and transforms imports to match our package structure.
#
add_custom_target(_vllm_fa4_cutedsl_C)
# Copy flash_attn/cute directory (needed for FA4) and transform imports
# The cute directory uses flash_attn.cute imports internally, which we replace
# with vllm.vllm_flash_attn.cute to match our package structure.
install(CODE "
file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
foreach(SRC_FILE \${CUTE_PY_FILES})
file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\")
get_filename_component(DST_DIR \${DST_FILE} DIRECTORY)
file(MAKE_DIRECTORY \${DST_DIR})
file(READ \${SRC_FILE} FILE_CONTENTS)
string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
endforeach()
" COMPONENT _vllm_fa4_cutedsl_C)