Commit fbeb8a6f authored by raojy

raw_vllm

parent 2ca8867f
# DeepSeek DeepGEMM Kernels Benchmark
This directory contains benchmarks comparing DeepSeek's DeepGEMM block FP8 GEMM kernels against vLLM's existing Triton- and CUTLASS-based kernels.
Currently it covers only dense GEMMs and runs only on Hopper GPUs.
## Setup
Install vLLM as you normally would, then install DeepGEMM from source in its own directory:
```bash
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
cd DeepGEMM
python setup.py install
uv pip install -e .
```
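To confirm the setup, you can check that both packages import cleanly (this assumes DeepGEMM installs the `deep_gemm` Python module, which vLLM's `vllm.utils.deep_gemm` wrapper looks for):
```bash
python -c "import deep_gemm, vllm; print(deep_gemm.__file__, vllm.__version__)"
```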
## Usage
```console
python benchmark_fp8_block_dense_gemm.py
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
===== STARTING FP8 GEMM BENCHMARK =====
PyTorch version: 2.5.1+cu124
CUDA version: 12.4
Triton version: 3.1.0
Using device: NVIDIA H100 80GB HBM3
WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
===== PERFORMANCE COMPARISON =====
DeepGEMM Implementation:
+------+-------+-------+-----------+--------+--------+
| m | n | k | Time (μs) | TFLOPS | GB/s |
+------+-------+-------+-----------+--------+--------+
| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 |
| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 |
| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 |
| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 |
| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 |
| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 |
| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 |
| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 |
| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 |
| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 |
| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 |
| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 |
| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 |
| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 |
| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 |
| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 |
+------+-------+-------+-----------+--------+--------+
vLLM Triton Implementation:
+------+-------+-------+-----------+--------+--------+--------------+
| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM |
+------+-------+-------+-----------+--------+--------+--------------+
| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster |
| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower |
| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower |
| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower |
| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower |
| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower |
| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster |
| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster |
| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower |
| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster |
| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower |
| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower |
| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower |
| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower |
| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower |
| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower |
+------+-------+-------+-----------+--------+--------+--------------+
vLLM CUTLASS Implementation:
+------+-------+-------+-----------+--------+--------+--------------+--------------+
| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster |
| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster |
| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster |
| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster |
| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster |
| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster |
| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster |
| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster |
| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster |
| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster |
| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster |
| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster |
| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 2.43x faster |
| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster |
| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster |
| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
===== AVERAGE PERFORMANCE =====
+----------------+------------+----------+---------------+
| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) |
+----------------+------------+----------+---------------+
| DeepGEMM | 310.98 | 1052.10 | 0.11 |
| vLLM Triton | 144.30 | 715.60 | 0.23 |
| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 |
+----------------+------------+----------+---------------+
===== AVERAGE SPEEDUPS =====
+-----------------------------+--------------+
| Comparison | Speedup |
+-----------------------------+--------------+
| DeepGEMM vs vLLM Triton | 1.71x faster |
| DeepGEMM vs vLLM CUTLASS | 0.94x slower |
| vLLM CUTLASS vs vLLM Triton | 1.84x faster |
+-----------------------------+--------------+
===== ACCURACY COMPARISON =====
+----------------+-----------------------+
| Implementation | Avg Diff vs Reference |
+----------------+-----------------------+
| DeepGEMM | 0.000684 |
| vLLM Triton | 0.000684 |
| vLLM CUTLASS | 0.000684 |
+----------------+-----------------------+
```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
import time
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8,
w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
calc_diff,
fp8_gemm_nt,
per_block_cast_to_fp8,
)
def benchmark_shape(
m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False,
) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
C_ref = A @ B.t()
# Pre-quantize B for all implementations
# (weights can be pre-quantized offline)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
# Block size configuration
block_size = [128, 128]
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True, tma_aligned_scales=True
)
C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True
)
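    # Note on activation-scale layouts (as set up above): DeepGEMM consumes
    # TMA-aligned, column-major scales; the CUTLASS path consumes column-major
    # scales; the Triton kernel uses the default row-major layout produced by
    # per_token_group_quant_fp8.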
# === DeepGEMM Implementation ===
def deepgemm_gemm():
fp8_gemm_nt(
(A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
)
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
return w8a8_triton_block_scaled_mm(
A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16,
)
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
return ops.cutlass_scaled_mm(
A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16,
)
# Run correctness check first
if verbose:
print("Running correctness check...")
C_deepgemm = deepgemm_gemm()
C_vllm_triton = vllm_triton_gemm()
C_vllm_cutlass = vllm_cutlass_gemm()
deepgemm_diff = calc_diff(C_deepgemm, C_ref)
vllm_triton_diff = calc_diff(C_vllm_triton, C_ref)
vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref)
if verbose:
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print(
"vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
)
print(
"vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
)
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm,
}
benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
for name, func in implementations.items():
# Warmup
for _ in range(warmup):
func()
torch.cuda.synchronize()
# Timing loop
torch.cuda.synchronize()
start = time.time()
for _ in range(repeat):
func()
torch.cuda.synchronize()
end = time.time()
# Calculate timing and TFLOPS
avg_time_ms = (end - start) / repeat * 1000
avg_time_us = avg_time_ms * 1000
tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12
gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3)
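        # FLOPs: 2*m*n*k for one GEMM. Bytes moved: the A (m*k) and B (k*n)
        # operands are FP8 (1 byte/element) and the C output (m*n) is BF16
        # (2 bytes/element); the block-scale tensors are ignored.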
benchmark_results["implementations"][name] = {
"time_ms": avg_time_ms,
"time_us": avg_time_us,
"tflops": tflops,
"gb_s": gb_s,
"diff": {
"DeepGEMM": 0.0
if name == "DeepGEMM"
else calc_diff(func(), C_deepgemm),
"Reference": deepgemm_diff
if name == "DeepGEMM"
else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
},
}
if verbose:
print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
if verbose:
print(
f"DeepGEMM is {1 / speedup:.2f}x "
f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
)
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
cutlass_vs_triton
)
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton"
)
return benchmark_results
def format_table_row(values, widths):
"""Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None):
"""Print a table with headers and rows."""
if title:
print(f"\n{title}")
# Calculate column widths based on headers and data
widths = [
max(len(str(h)), max(len(str(row[i])) for row in rows))
for i, h in enumerate(headers)
]
# Create separator line
separator = "+-" + "-+-".join("-" * w for w in widths) + "-+"
# Print table
print(separator)
print(format_table_row(headers, widths))
print(separator)
for row in rows:
print(format_table_row(row, widths))
print(separator)
def format_speedup(value):
"""Format speedup value with indicator if it's faster or slower."""
return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}"
def run_benchmarks(verbose: bool = False):
"""Run benchmarks for a set of common shapes."""
print("===== STARTING FP8 GEMM BENCHMARK =====")
# Make sure we're using the GPU
if not torch.cuda.is_available():
print("CUDA not available! Tests require GPU.")
return
# Print system information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Triton version: {triton.__version__}")
print(f"Using device: {torch.cuda.get_device_name()}")
# Enable TF32 for better performance
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Set seeds for reproducibility
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Define benchmark shapes (m, n, k)
shapes = [
(8, 4096, 7168),
(8, 7168, 18432),
(8, 18432, 7168),
(64, 4096, 7168),
(64, 7168, 18432),
(64, 18432, 7168),
(64, 24576, 1536),
(64, 32768, 512),
(64, 7168, 16384),
(128, 4096, 7168),
(128, 7168, 18432),
(128, 18432, 7168),
(1024, 4096, 7168),
(1024, 18432, 7168),
(2048, 4096, 7168),
(4096, 4096, 7168),
]
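    # NOTE: the assignment below overrides the shape list above; edit or remove
    # it to benchmark the original set of shapes.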
shapes = [
# (64, 2112, 7168),
(64, 24576, 1536),
(64, 32768, 512),
(64, 7168, 16384),
(64, 4096, 7168),
(64, 7168, 2048),
# (128, 2112, 7168),
(128, 24576, 1536),
(128, 32768, 512),
(128, 7168, 16384),
(128, 4096, 7168),
(128, 7168, 2048),
# (4096, 2112, 7168),
(4096, 24576, 1536),
(4096, 32768, 512),
(4096, 7168, 16384),
(4096, 4096, 7168),
(4096, 7168, 2048),
]
all_results = []
for m, n, k in shapes:
result = benchmark_shape(m, n, k, verbose=verbose)
all_results.append(result)
# Print results in a nicely formatted table
print("\n===== PERFORMANCE COMPARISON =====")
# Print DeepGEMM table
deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"]
deepgemm_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
]
)
print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
# Print vLLM Triton table
triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(speedup),
]
)
print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
"m",
"n",
"k",
"Time (μs)",
"TFLOPS",
"GB/s",
"vs DeepGEMM",
"vs Triton",
]
cutlass_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append(
[
shape["m"],
shape["n"],
shape["k"],
f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}",
f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton),
]
)
print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
}
for result in all_results:
for impl in implementations:
impl_data = result["implementations"][impl]
avg_metrics[impl]["tflops"] += impl_data["tflops"]
avg_metrics[impl]["gb_s"] += impl_data["gb_s"]
avg_metrics[impl]["time_ms"] += impl_data["time_ms"]
num_shapes = len(all_results)
avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"]
avg_rows = []
for impl in implementations:
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append(
[impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
)
print_table(avg_headers, avg_rows)
# Calculate average speedups
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0,
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
vllm_triton_time / vllm_cutlass_time
)
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
speedup_rows = []
for comparison, total in avg_speedups.items():
avg_speedup = total / num_shapes
status = "faster" if avg_speedup > 1 else "slower"
speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"])
print_table(speedup_headers, speedup_rows)
# Average accuracy comparison
print("\n===== ACCURACY COMPARISON =====")
avg_diff = {impl: 0 for impl in implementations}
for result in all_results:
for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []
for impl in implementations:
diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"])
print_table(diff_headers, diff_rows)
if __name__ == "__main__":
run_benchmarks(verbose=False)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import regex as re
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.utils.argparse_utils import FlexibleArgumentParser
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
)
parser.add_argument("filename", type=str)
args = parser.parse_args()
with open(args.filename, "rb") as f:
data = pickle.load(f)
raw_results: list[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
for v in raw_results:
result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
if result is not None:
KN = result.group(1)
else:
raise Exception("MKN not found")
result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
if result is not None:
M = result.group(1)
else:
raise Exception("MKN not found")
kernel = v.task_spec.description
results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median})
rows = int(math.ceil(len(results) / 2))
fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
axs = axs.flatten()
for axs_idx, (shape, data) in enumerate(results.items()):
plt.sca(axs[axs_idx])
df = pd.DataFrame(data)
sns.lineplot(
data=df,
x="batch_size",
y="median",
hue="kernel",
style="kernel",
markers=True,
dashes=False,
palette="Dark2",
)
plt.title(f"Shape: {shape}")
plt.ylabel("time (median, s)")
plt.tight_layout()
plt.savefig("graph_machete_bench.pdf")
pandas
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Callable, Iterable
from typing import Any
import torch
import torch.utils.benchmark as TBenchmark
from torch.utils.benchmark import Measurement as TMeasurement
@dataclasses.dataclass
class CudaGraphBenchParams:
num_ops_in_cuda_graph: int
@dataclasses.dataclass
class ArgPool:
"""
    When some argument of the benchmarked function is annotated with this type,
    the benchmarking class (Bench) will collapse the argument to a single value
    picked from the given list of values during function invocation.
    For every invocation during a benchmarking run, it will choose a
    different value from the list.
    """
values: Iterable[Any]
def __getitem__(self, index):
return self.values[index]
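# Illustrative sketch (hypothetical shapes/labels) of how ArgPool is meant to
# be used: each invocation made by the benchmark picks the next tensor from
# the pool instead of reusing a single cached input.
#
#   a_pool = ArgPool([torch.randn(128, 128, device="cuda") for _ in range(8)])
#   b = torch.randn(128, 128, device="cuda")
#   with Bench(None, "mm", "128x128x128", "torch.mm", torch.mm, a_pool, b) as bench:
#       measurement = bench.run()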
class Bench:
class ArgsIterator:
def __init__(self, args_list, kwargs_list):
assert len(args_list) == len(kwargs_list)
self.args_list = args_list
self.kwargs_list = kwargs_list
self.n = len(self.args_list)
self.idx = 0
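        # Note: __next__ is written as a generator function; calling it returns
        # an infinite iterator that cycles through the (args, kwargs) pairs,
        # which is how the runners below consume the argument pool.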
def __next__(self):
while True:
yield (self.args_list[self.idx], self.kwargs_list[self.idx])
self.idx += 1
self.idx = self.idx % self.n
def reset(self):
self.idx = 0
@property
def n_args(self):
return self.n
def __init__(
self,
cuda_graph_params: CudaGraphBenchParams | None,
label: str,
sub_label: str,
description: str,
fn: Callable,
*args,
**kwargs,
):
self.cuda_graph_params = cuda_graph_params
self.use_cuda_graph = self.cuda_graph_params is not None
self.label = label
self.sub_label = sub_label
self.description = description
self.fn = fn
# Process args
self._args = args
self._kwargs = kwargs
self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs)
self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list)
# Cudagraph runner
self.g = None
if self.use_cuda_graph:
self.g = self.get_cuda_graph_runner()
# benchmark run params
self.min_run_time = 1
def collapse_argpool(self, *args, **kwargs):
argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
arg for arg in kwargs.values() if isinstance(arg, ArgPool)
]
if len(argpool_args) == 0:
return [args], [kwargs]
# Make sure all argpools are of the same size
argpool_size = len(argpool_args[0].values)
assert all([argpool_size == len(arg.values) for arg in argpool_args])
# create copies of the args
args_list = []
kwargs_list = []
for _ in range(argpool_size):
args_list.append(args)
kwargs_list.append(kwargs.copy())
for i in range(argpool_size):
# collapse args; Just pick the ith value
args_list[i] = tuple(
[arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]]
)
# collapse kwargs
kwargs_i = kwargs_list[i]
arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)]
for k in arg_pool_keys:
# again just pick the ith value
kwargs_i[k] = kwargs_i[k][i]
kwargs_list[i] = kwargs_i
return args_list, kwargs_list
def get_cuda_graph_runner(self):
assert self.use_cuda_graph
assert self.args_iterator is not None
num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
# warmup
args_it = self.args_iterator.__next__()
for _ in range(2):
args, kwargs = next(args_it)
self.fn(*args, **kwargs)
self.args_iterator.reset()
args_it = self.args_iterator.__next__()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
for _ in range(num_graph_ops):
args, kwargs = next(args_it)
self.fn(*args, **kwargs)
return g
    def run_cuda_graph(self) -> TMeasurement:
assert self.use_cuda_graph
globals = {"g": self.g}
return TBenchmark.Timer(
stmt="g.replay()",
globals=globals,
label=(
f"{self.label}"
f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
),
sub_label=self.sub_label,
description=self.description,
).blocked_autorange(min_run_time=self.min_run_time)
def run_eager(self) -> TMeasurement:
setup = None
stmt = None
globals = None
has_arg_pool = self.args_iterator.n_args > 1
if has_arg_pool:
setup = """
args_iterator.reset()
args_it = args_iterator.__next__()
"""
stmt = """
args, kwargs = next(args_it)
fn(*args, **kwargs)
"""
globals = {"fn": self.fn, "args_iterator": self.args_iterator}
else:
# no arg pool. Just use the args and kwargs directly
self.args_iterator.reset()
args_it = self.args_iterator.__next__()
args, kwargs = next(args_it)
setup = ""
stmt = """
fn(*args, **kwargs)
"""
globals = {"fn": self.fn, "args": args, "kwargs": kwargs}
return TBenchmark.Timer(
stmt=stmt,
setup=setup,
globals=globals,
label=self.label,
sub_label=self.sub_label,
description=self.description,
).blocked_autorange(min_run_time=self.min_run_time)
def run(self) -> TMeasurement:
timer = None
if self.use_cuda_graph: # noqa SIM108
            timer = self.run_cuda_graph()
else:
timer = self.run_eager()
if not timer.meets_confidence() or timer.has_warnings:
print("Doesn't meet confidence - re-running bench ...")
return self.run()
return timer
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
if exc_type:
print(f"exc type {exc_type}")
print(f"exc value {exc_value}")
print(f"exc traceback {traceback}")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:
# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
# - TP1 : K = 14336, N = 4096
# - TP2 : K = 7168, N = 4096
# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
# - TP1 : K = 4096, N = 6144
# - TP4 : K = 4096, N = 1536
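# For example (illustrative helper, not part of this file's data), the per-GPU
# GEMM shape at a given TP size can be derived from an entry above as:
#   def shard(kn, tp_split_dim, tp_size):
#       kn = list(kn)
#       kn[tp_split_dim] //= tp_size
#       return kn
#   shard([14336, 4096], 0, 2)  # -> [7168, 4096], matching the TP2 example above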
# TP1 shapes
WEIGHT_SHAPES = {
"mistralai/Mistral-7B-v0.1": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-7b-hf": [
([4096, 12288], 1),
([4096, 4096], 0),
([4096, 22016], 1),
([11008, 4096], 0),
],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1),
([5120, 5120], 0),
([5120, 27648], 1),
([13824, 5120], 0),
],
"meta-llama/Llama-2-70b-hf": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"meta-llama/Llama-3.1-405b-hf": [
([16384, 18432], 1),
([16384, 16384], 0),
([16384, 106496], 1),
([53248, 16384], 0),
],
"meta-llama/Llama-3.1-8B-Instruct": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-3.3-70B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 57344], 1),
([28672, 8192], 0),
],
"mistralai/Mistral-Large-Instruct-2407": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 57344], 1),
([28672, 12288], 0),
],
"Qwen/Qwen2.5-7B-Instruct": [
([3584, 4608], 1),
([3584, 3584], 0),
([3584, 37888], 1),
([18944, 3584], 0),
],
"Qwen/Qwen2.5-32B-Instruct": [
([5120, 7168], 1),
([5120, 5120], 0),
([5120, 55296], 1),
([27648, 5120], 0),
],
"Qwen/Qwen2.5-72B-Instruct": [
([8192, 10240], 1),
([8192, 8192], 0),
([8192, 59136], 1),
([29568, 8192], 0),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
([2048, 3072], 1),
([2048, 4096], 1),
([2048, 2048], 0),
([2048, 576], 0),
([2048, 21888], 1),
([10944, 2048], 0),
([2048, 2816], 1),
([1408, 2048], 0),
],
"CohereLabs/c4ai-command-a-03-2025": [
([12288, 14336], 1),
([12288, 12288], 0),
([12288, 73728], 1),
([36864, 12288], 0),
],
}
# Benchmark KV Cache Offloading with Multi-Turn Conversations
The pip requirements for `benchmark_serving_multi_turn.py` can be found in `requirements.txt`.
First, start serving your model:
```bash
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
vllm serve $MODEL_PATH --served-model-name Llama
```
The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from Hugging Face).
## Synthetic Multi-Turn Conversations
Download the following text file (used for generation of synthetic conversations)
```bash
wget https://www.gutenberg.org/ebooks/1184.txt.utf-8
mv 1184.txt.utf-8 pg1184.txt
```
The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`).
You may use other text files if you prefer; this specific file is not required.
Then run the benchmarking script:
```bash
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
```
You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
If successful, you will see output similar to the following
```bash
----------------------------------------------------------------------------------------------------
Statistics summary:
runtime_sec = 215.810
requests_per_sec = 0.769
----------------------------------------------------------------------------------------------------
count mean std min 25% 50% 75% 90% 99% max
ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54
tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05
latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94
input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00
input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00
output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00
output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00
----------------------------------------------------------------------------------------------------
```
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
benchmark-only runtime so the reported throughput stays comparable).
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
When the input is a JSON file with the field `"filetype": "generate_conversations"`, the tool will generate synthetic multi-turn (question and answer) conversations.
The file `generate_multi_turn.json` is an example file.
The file must contain the sections `prompt_input` and `prompt_output`.
The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
The final value will always be rounded to an even number so each user turn has a reply.
* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
* `num_tokens` - Total token length of each **user** message (one turn).
The `prompt_output` section must contain `num_tokens`:
* `num_tokens` - Total token length of each **assistant** message (one turn).
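A minimal example combining these sections (the values and distribution choices below are illustrative; see `generate_multi_turn.json` for a complete file):
```json
{
    "filetype": "generate_conversations",
    "num_conversations": 24,
    "text_files": ["pg1184.txt"],
    "prompt_input": {
        "num_turns": {"distribution": "uniform", "min": 12, "max": 18},
        "prefix_num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160}
    },
    "prompt_output": {
        "num_tokens": {"distribution": "uniform", "min": 80, "max": 120}
    }
}
```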
### Random distributions for synthetic conversations generation
When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
The distribution determines how to randomly sample values for the field.
The available distributions are listed below.
**Note:** The optional `max` field (for lognormal, zipf, and poisson) caps sampled values at an upper bound.<br/>
This can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
#### constant
```json
{
"distribution": "constant",
"value": 500
}
```
* `value` - the fixed integer value (always returns the same number).
#### uniform
```json
{
"distribution": "uniform",
"min": 12,
"max": 18
}
```
* `min` - minimum value (inclusive).
* `max` - maximum value (inclusive); must be greater than or equal to `min`.
#### lognormal
```json
{
"distribution": "lognormal",
"average": 1000,
"max": 5000
}
```
You can parameterize the lognormal distribution in one of two ways:
Using the average and optional median ratio:
* `average` - target average value of the distribution.
* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
Using the parameters of the underlying normal distribution:
* `mean` - mean of the underlying normal distribution.
* `sigma` - standard deviation of the underlying normal distribution.
#### zipf
```json
{
"distribution": "zipf",
"alpha": 1.2,
"max": 100
}
```
* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
#### poisson
```json
{
"distribution": "poisson",
"alpha": 10,
"max": 50
}
```
* `alpha` - expected value (λ). Also the variance of the distribution.
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json`
Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py`
```bash
python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128
```
The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles.
The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed).
Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`.
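You can then run the benchmark against this file, reusing the serving setup from above:
```bash
python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
    --input-file sharegpt_conv_128.json --num-clients 2
```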
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple
import numpy as np # type: ignore
import pandas as pd # type: ignore
from bench_utils import (
TEXT_SEPARATOR,
Color,
logger,
)
from tqdm import tqdm
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
ConvId = str
# A list of dicts (dicts with keys "id" and "messages")
ShareGptConversations = list[dict[str, Any]]
# A list of dicts (dicts with keys "role" and "content")
MessagesList = list[dict[str, str]]
# Map conversation ID to conversation messages
ConversationsMap = dict[ConvId, MessagesList]
class Distribution(ABC):
@abstractmethod
def sample(self, size: int = 1) -> np.ndarray:
pass
class UniformDistribution(Distribution):
def __init__(
self,
min_val: int | float,
max_val: int | float,
is_integer: bool = True,
) -> None:
self.min_val = min_val
self.max_val = max_val
self.is_integer = is_integer
def sample(self, size: int = 1) -> np.ndarray:
if self.is_integer:
return np.random.randint(
int(self.min_val), int(self.max_val + 1), size=size
)
else:
return np.random.uniform(self.min_val, self.max_val, size=size)
def __repr__(self) -> str:
return f"UniformDistribution[{self.min_val}, {self.max_val}]"
class ConstantDistribution(Distribution):
def __init__(self, value: int | float) -> None:
self.value = value
self.max_val = value
def sample(self, size: int = 1) -> np.ndarray:
return np.full(shape=size, fill_value=self.value)
def __repr__(self) -> str:
return f"Constant[{self.value}]"
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.zipf(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"ZipfDistribution[{self.alpha}]"
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: int | None = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.poisson(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"PoissonDistribution[{self.alpha}]"
class LognormalDistribution(Distribution):
def __init__(
self,
mean: float | None = None,
sigma: float | None = None,
average: int | None = None,
median_ratio: float | None = None,
max_val: int | None = None,
) -> None:
self.average = average
self.median_ratio = median_ratio
self.max_val = max_val
if average is not None:
            if average < 1:
                raise ValueError("Lognormal average must be at least 1")
if mean or sigma:
raise ValueError(
"When using lognormal average, you can't provide mean/sigma"
)
if self.median_ratio is None:
# Default value that provides relatively wide range of values
self.median_ratio = 0.85
# Calculate mean/sigma of np.random.lognormal based on the average
mean, sigma = self._generate_lognormal_by_median(
target_average=self.average, median_ratio=self.median_ratio
)
else:
if mean is None or sigma is None:
raise ValueError(
"Must provide both mean and sigma if average is not used"
)
if mean <= 0 or sigma < 0:
raise ValueError(
"Lognormal mean must be positive and sigma must be non-negative"
)
# Mean and standard deviation of the underlying normal distribution
# Based on numpy.random.lognormal
self.mean = mean
self.sigma = sigma
@staticmethod
def _generate_lognormal_by_median(
target_average: int, median_ratio: float
) -> tuple[float, float]:
"""
Compute (mu, sigma) for a lognormal distribution given:
- a target average (mean of the distribution)
- a ratio of median / mean (controls skewness), assume mean > median
Background:
If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
* mean(X) = exp(mu + sigma^2 / 2)
* median(X) = exp(mu)
So:
median / mean = exp(mu) / exp(mu + sigma^2 / 2)
= exp(-sigma^2 / 2)
Rearranging:
sigma^2 = 2 * ln(mean / median)
mu = ln(median)
This gives a unique (mu, sigma) for any valid mean and median.
"""
# Check input validity: median must be smaller than mean
if median_ratio <= 0 or median_ratio >= 1:
raise ValueError("median_ratio must be in range (0, 1)")
target_median = target_average * median_ratio
# Solve sigma^2 = 2 * ln(mean / median)
sigma = np.sqrt(2 * np.log(target_average / target_median))
mu = np.log(target_median)
return mu, sigma
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
if self.average is not None:
# Scale to average
samples *= self.average / samples.mean()
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
if self.average:
return (
f"LognormalDistribution[{self.average}, "
f"{self.median_ratio}, {self.max_val}]"
)
return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
class GenConvArgs(NamedTuple):
num_conversations: int
text_files: list[str]
input_num_turns: Distribution
input_common_prefix_num_tokens: Distribution
input_prefix_num_tokens: Distribution
input_num_tokens: Distribution
output_num_tokens: Distribution
print_stats: bool
def verify_field_exists(
conf: dict, field_name: str, section: str, subsection: str
) -> None:
if field_name not in conf:
raise ValueError(
f"Missing field '{field_name}' in {section=} and {subsection=}"
)
def get_random_distribution(
conf: dict, section: str, subsection: str, optional: bool = False
) -> Distribution:
# section can be "prompt_input" or "prompt_output" (both required)
conf = conf[section]
if optional and subsection not in conf:
# Optional subsection, if not found assume the value is always 0
return ConstantDistribution(0)
# subsection can be "num_turns", "num_tokens" or "prefix_num_tokens"
if subsection not in conf:
raise ValueError(f"Missing subsection {subsection} in section {section}")
conf = conf[subsection]
distribution = conf.get("distribution")
if distribution is None:
raise ValueError(
f"Missing field 'distribution' in {section=} and {subsection=}"
)
if distribution == "constant":
verify_field_exists(conf, "value", section, subsection)
return ConstantDistribution(conf["value"])
elif distribution == "zipf":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return ZipfDistribution(conf["alpha"], max_val=max_val)
elif distribution == "poisson":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
max_val = conf.get("max", None)
if "average" in conf:
# Infer lognormal mean/sigma (numpy) from input average
median_ratio = conf.get("median_ratio", None)
return LognormalDistribution(
average=conf["average"], median_ratio=median_ratio, max_val=max_val
)
# Use mean/sigma directly (for full control over the distribution)
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
return LognormalDistribution(
mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
)
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
verify_field_exists(conf, "max", section, subsection)
min_value = conf["min"]
max_value = conf["max"]
assert min_value > 0
assert min_value <= max_value
is_integer = isinstance(min_value, int) and isinstance(max_value, int)
return UniformDistribution(min_value, max_value, is_integer)
else:
raise ValueError(f"Unknown distribution: {distribution}")
def parse_input_json_file(conf: dict) -> GenConvArgs:
# Validate the input file
assert isinstance(conf, dict)
required_fields = [
"filetype",
"num_conversations",
"text_files",
"prompt_input",
"prompt_output",
]
for field in required_fields:
assert field in conf, f"Missing field {field} in input {conf}"
assert conf["filetype"] == "generate_conversations"
assert conf["num_conversations"] > 0, "num_conversations should be larger than zero"
text_files = conf["text_files"]
assert isinstance(text_files, list), "Field 'text_files' should be a list"
assert len(text_files) > 0, (
"Field 'text_files' should be a list with at least one file"
)
# Parse the parameters for the prompt input/output workload
input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns")
input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens")
input_common_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "common_prefix_num_tokens", optional=True
)
input_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "prefix_num_tokens"
)
output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens")
print_stats: bool = conf.get("print_stats", False)
assert isinstance(print_stats, bool), (
"Field 'print_stats' should be either 'true' or 'false'"
)
args = GenConvArgs(
num_conversations=conf["num_conversations"],
text_files=text_files,
input_num_turns=input_num_turns,
input_common_prefix_num_tokens=input_common_prefix_num_tokens,
input_prefix_num_tokens=input_prefix_num_tokens,
input_num_tokens=input_num_tokens,
output_num_tokens=output_num_tokens,
print_stats=print_stats,
)
return args
def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None:
# Collect statistics
conv_stats: list[dict[Any, Any]] = []
req_stats: list[int] = []
print("\nCollecting statistics...")
for messages in conversations.values():
# messages is a list of dicts
user_tokens: list[int] = []
assistant_tokens: list[int] = []
request_tokens: list[int] = []
req_tokens = 0
for m in messages:
content = m["content"]
num_tokens = len(tokenizer(content).input_ids)
if m["role"] == "user":
user_tokens.append(num_tokens)
# New user prompt including all chat history
req_tokens += num_tokens
request_tokens.append(req_tokens)
elif m["role"] == "assistant":
assistant_tokens.append(num_tokens)
# Update assistant answer
# (will be part of chat history for the next user prompt)
req_tokens += num_tokens
item_stats = {
"conversation_turns": len(messages),
"user_tokens": mean(user_tokens),
"assistant_tokens": mean(assistant_tokens),
}
conv_stats.append(item_stats)
req_stats.extend(request_tokens)
# Print statistics
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(conv_stats)
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Request statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(req_stats, columns=["request_tokens"])
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
def generate_conversations(
args: GenConvArgs, tokenizer: AutoTokenizer
) -> ConversationsMap:
# Text for all user prompts
# (text from the input text files will be appended to this line)
base_prompt_text = "Please rewrite the following text and add more content: "
base_prompt_token_count = len(
tokenizer.encode(base_prompt_text, add_special_tokens=False)
)
logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}")
logger.info(args)
list_of_tokens = []
for filename in args.text_files:
# Load text file that will be used to generate prompts
with open(filename) as file:
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
logger.info(
f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
f"total tokens so far: {len(list_of_tokens)}"
)
conversations: ConversationsMap = {}
conv_id = 0
# Generate number of turns for every conversation
turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations)
# Turn count should be at least 2 (one user prompt and one assistant answer)
turn_count = np.maximum(turn_count, 2)
# Round up to an even number (every user prompt should have an answer)
turn_count = turn_count + (turn_count % 2)
# Generate number of prefix tokens for every conversation
conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample(
args.num_conversations
)
# Used to reduce shared text between conversations
# (jump/skip over text sections between conversations)
base_offset = 0
# Common prefix size for all conversations (only 1 sample required)
common_prefix_text = ""
common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0]
if common_prefix_tokens > 0:
# Using "." at the end to separate sentences
common_prefix_text = (
tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "."
)
base_offset += common_prefix_tokens
for conv_id in tqdm(
range(args.num_conversations),
total=args.num_conversations,
desc="Generating conversations",
unit="conv",
):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
int
)
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
for turn_id in range(nturns):
if user_turn:
role = "user"
num_tokens = input_token_count[turn_id]
# Generate the user prompt,
# use a unique prefix (the conv_id) for each conversation
# (to avoid shared prefix between conversations)
content = f"{conv_id} is a nice number... "
if len(common_prefix_text) > 0 and turn_id == 0:
content = common_prefix_text + content
# Update the number of tokens left for the content
num_tokens -= len(tokenizer.encode(content, add_special_tokens=False))
if turn_id == 0:
prefix_num_tokens = conv_prefix_tokens[conv_id]
if prefix_num_tokens > 0:
# Add prefix text (context) to the first turn
start_offset = base_offset
end_offset = start_offset + prefix_num_tokens
assert len(list_of_tokens) > end_offset, (
"Not enough input text to generate "
f"{prefix_num_tokens} tokens for the "
f"prefix text ({start_offset=}, {end_offset=})"
)
content += f"{conv_id}, " + tokenizer.decode(
list_of_tokens[start_offset:end_offset]
)
base_offset += prefix_num_tokens
# Add the actual user prompt/question after the prefix text
content += base_prompt_text
num_tokens -= base_prompt_token_count
if num_tokens > 0:
# Add text from the input file (to reach the desired token count)
start_offset = base_offset + turn_id * input_token_count.max()
end_offset = start_offset + num_tokens
assert len(list_of_tokens) > end_offset, (
f"Not enough input text to generate {num_tokens} tokens "
f"for the prompt ({start_offset=}, {end_offset=})"
)
# Convert tokens back to text
content += tokenizer.decode(list_of_tokens[start_offset:end_offset])
else:
role = "assistant"
# This content will not be used as input to the LLM server
# (actual answers will be used instead).
# Content is only required to determine the min_tokens/max_tokens
# (inputs to the LLM server).
num_tokens = output_token_count[turn_id]
assert len(list_of_tokens) > num_tokens, (
f"Not enough input text to generate {num_tokens} "
"tokens for assistant content"
)
content = tokenizer.decode(list_of_tokens[:num_tokens])
# Append the user/assistant message to the list of messages
messages.append({"role": role, "content": content})
user_turn = not user_turn
# Add the new conversation
conversations[f"CONV_ID_{conv_id}"] = messages
# Increase base offset for the next conversation
base_offset += nturns
if args.print_stats:
print_conv_stats(conversations, tokenizer)
return conversations
def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap:
conversations: ConversationsMap = {}
for item in input_list:
conv_id: str = item["id"]
assert isinstance(conv_id, str)
assert conv_id not in conversations, (
f"Conversation ID {conv_id} found more than once in the input"
)
messages: MessagesList = item["messages"]
assert isinstance(messages, list), (
f"Conversation messages should be a list (ID: {conv_id})"
)
assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})"
conversations[conv_id] = messages
logger.info(f"Using {len(conversations)} unique conversations (IDs)")
assert len(conversations) == len(input_list)
# Print statistics about the selected conversations
stats: list[dict[str, Any]] = []
for conv_data in conversations.values():
stats.append({"num_turns": len(conv_data)})
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles)
print(conv_stats.transpose())
print(TEXT_SEPARATOR)
return conversations
def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations:
output: ShareGptConversations = []
for conv_id, conv_data in input_dict.items():
new_item = {"id": conv_id, "messages": conv_data}
output.append(new_item)
return output
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
from enum import Enum
class Color(Enum):
RED = "\033[91m"
GREEN = "\033[92m"
BLUE = "\033[94m"
PURPLE = "\033[95m"
CYAN = "\033[96m"
YELLOW = "\033[93m"
RESET = "\033[0m"
def __str__(self):
return self.value
TEXT_SEPARATOR = "-" * 100
# Configure the logger
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] - %(message)s",
datefmt="%d-%m-%Y %H:%M:%S",
)
logger = logging.getLogger(__name__)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import json
import logging
import multiprocessing as mp
import os
import random
import time
from collections import Counter, deque
from datetime import datetime
from enum import Enum
from http import HTTPStatus
from statistics import mean
from typing import NamedTuple
import aiohttp # type: ignore
import numpy as np # type: ignore
import pandas as pd # type: ignore
from bench_dataset import (
ConversationsMap,
ConvId,
GenConvArgs,
MessagesList,
ShareGptConversations,
conversations_dict_to_list,
conversations_list_to_dict,
generate_conversations,
parse_input_json_file,
)
from bench_utils import TEXT_SEPARATOR, Color, logger
from transformers import AutoTokenizer # type: ignore
NUM_TOKENS_FROM_DATASET = 0
TERM_SIGNAL = None
class ConversationSampling(str, Enum):
ROUND_ROBIN = "round_robin"
RANDOM = "random"
def __str__(self):
return self.value
class ClientArgs(NamedTuple):
seed: int
max_num_requests: int | None
skip_first_turn: bool
max_turns: int | None
max_active_conversations: int
verbose: bool
print_content: bool
verify_output: bool
conversation_sampling: ConversationSampling
request_rate: float
max_retries: int
class RequestArgs(NamedTuple):
chat_url: str
model: str
stream: bool
limit_min_tokens: int # Use negative value for no limit
limit_max_tokens: int # Use negative value for no limit
timeout_sec: int
class BenchmarkArgs(NamedTuple):
url: str
num_clients: int
early_stop: bool
class ServerResponse(NamedTuple):
valid: bool
ttft_ms: float # time to first chunk
tpot_ms: float # time per output chunk (one or more tokens)
latency_ms: float
start_time_ms: float
first_chunk: str # first chunk of the content
content: str # includes the first_chunk
num_chunks: int
def __str__(self) -> str:
return f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}" # noqa: E501
class RequestStats(NamedTuple):
ttft_ms: float
tpot_ms: float
latency_ms: float
start_time_ms: float
input_num_turns: int
input_num_tokens: int
output_num_tokens: int
output_num_chunks: int
output_num_first_chunk_tokens: int
approx_cached_percent: float
conversation_id: str
client_id: int
def __str__(self) -> str:
return (
f"ttft_ms {self.ttft_ms:.2f}, tpot_ms {self.tpot_ms:.2f}, latency_ms {self.latency_ms:.2f}, input_num_tokens {self.input_num_tokens}, " # noqa: E501
f"output_num_tokens {self.output_num_tokens} ({self.output_num_chunks} chunks, {self.output_num_first_chunk_tokens} tokens in first chunk), " # noqa: E501
f"approx_cached_percent {self.approx_cached_percent:.2f}%"
)
class MetricStats:
def __init__(self) -> None:
self.min: float | None = None
self.max: float | None = None
self.avg: float | None = None
self.sum = 0.0
self.count = 0
def update(self, value: float) -> None:
if self.min is None:
self.min = value
else:
self.min = min(self.min, value)
if self.max is None:
self.max = value
else:
self.max = max(self.max, value)
self.sum += value
self.count += 1
self.avg = self.sum / self.count
def __repr__(self) -> str:
if self.count == 0:
return "no data"
return f"avg: {self.avg:>10.3f}, min: {self.min:>10.3f}, max: {self.max:>10.3f}"
class MovingAverage:
def __init__(self, window_size: int) -> None:
self.window_size = window_size
self.window = np.zeros(window_size)
self.index = 0
self.sum = 0.0
self.count = 0
self.avg: float | None = None
def update(self, new_value: float) -> None:
if self.count < self.window_size:
# Filling up the window
self.sum += new_value
self.window[self.count] = new_value
self.count += 1
else:
# Window is full, start replacing old values
old_value = self.window[self.index]
self.sum = self.sum - old_value + new_value
self.window[self.index] = new_value
self.index = (self.index + 1) % self.window_size
self.avg = self.sum / self.count
def __repr__(self) -> str:
if self.count == 0:
return "no data"
return f"avg: {self.avg:>10.3f} ({self.count} samples)"
class DebugStats:
def __init__(self, logger: logging.Logger, window_size: int) -> None:
self.logger = logger
self.metrics: dict[str, MovingAverage | MetricStats] = {
"moving_avg_ttft_ms": MovingAverage(window_size),
"moving_avg_tpot_ms": MovingAverage(window_size),
"ttft_ms": MetricStats(),
"tpot_ms": MetricStats(),
"latency_ms": MetricStats(),
"input_num_turns": MetricStats(),
"input_num_tokens": MetricStats(),
"output_num_tokens": MetricStats(),
}
def update(self, data: RequestStats) -> None:
self.metrics["ttft_ms"].update(data.ttft_ms)
self.metrics["moving_avg_ttft_ms"].update(data.ttft_ms)
self.metrics["tpot_ms"].update(data.tpot_ms)
self.metrics["moving_avg_tpot_ms"].update(data.tpot_ms)
self.metrics["latency_ms"].update(data.latency_ms)
self.metrics["input_num_turns"].update(data.input_num_turns)
self.metrics["input_num_tokens"].update(data.input_num_tokens)
self.metrics["output_num_tokens"].update(data.output_num_tokens)
def print(self) -> None:
self.logger.info("-" * 50)
for k, v in self.metrics.items():
kv_info = f"[{k:25}] {v}"
self.logger.info(kv_info)
self.logger.info("-" * 50)
def nanosec_to_millisec(value: float) -> float:
return value / 1000000.0
def nanosec_to_sec(value: float) -> float:
return value / 1000000000.0
async def send_request(
session: aiohttp.ClientSession,
messages: list[dict[str, str]],
chat_url: str,
model: str,
stream: bool = True,
min_tokens: int | None = None,
max_tokens: int | None = None,
timeout_sec: int = 120,
) -> ServerResponse:
payload = {
"model": model,
"messages": messages,
"seed": 0,
"temperature": 0.0,
}
if stream:
payload["stream"] = True
payload["stream_options"] = {"include_usage": False}
if min_tokens is not None:
payload["min_tokens"] = min_tokens
if max_tokens is not None:
payload["max_tokens"] = max_tokens
headers = {"Content-Type": "application/json"}
# Calculate the timeout for the request
if max_tokens is not None:
# Assume TPOT of 200ms and use max_tokens to determine timeout
token_based_timeout = int(max_tokens * 0.2)
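# For example, max_tokens=2048 would give int(2048 * 0.2) = 409 seconds,
# which replaces the base timeout only if it is larger.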
if token_based_timeout > timeout_sec:
timeout_sec = token_based_timeout
logger.info(
"Using timeout of %ds based on max_tokens %d",
timeout_sec,
max_tokens,
)
timeout = aiohttp.ClientTimeout(total=timeout_sec)
valid_response = True
ttft: float | None = None
chunk_delay: list[int] = []
latency: float | None = None
first_chunk = ""
generated_text = ""
start_time: int = time.perf_counter_ns()
most_recent_timestamp: int = start_time
async with session.post(
url=chat_url, json=payload, headers=headers, timeout=timeout
) as response:
http_status = HTTPStatus(response.status)
if http_status == HTTPStatus.OK:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk == "[DONE]":
# End of stream
latency = time.perf_counter_ns() - start_time
elif stream is False:
data = json.loads(chunk)
message = data["choices"][0]["message"]
assert message["role"] == "assistant"
generated_text += message["content"]
else:
timestamp: int = time.perf_counter_ns()
data = json.loads(chunk)
# Delta is the new content/text/data
delta = data["choices"][0]["delta"]
if delta.get("content", None):
if ttft is None:
# First token
first_token_time = time.perf_counter_ns()
ttft = first_token_time - start_time
first_chunk = delta["content"]
else:
# Decoding phase
chunk_delay.append(timestamp - most_recent_timestamp)
generated_text += delta["content"]
most_recent_timestamp = timestamp
else:
valid_response = False
content = await response.text()
logger.warning(
f"{Color.YELLOW}Received HTTP status {http_status.value} "
f"({http_status.phrase}): {content}{Color.RESET}"
)
if latency is None:
latency = -1.0
if valid_response:
# Streaming is disabled, latency was not set
latency = time.perf_counter_ns() - start_time
if ttft is None:
# The response was a single chunk
ttft = latency
# Each chunk may include more than one token
tpot: float = mean(chunk_delay) if len(chunk_delay) > 0 else 0.0
num_chunks: int = len(chunk_delay)
sr = ServerResponse(
valid=valid_response,
ttft_ms=nanosec_to_millisec(ttft) if ttft > 0.0 else -1.0,
tpot_ms=nanosec_to_millisec(tpot),
latency_ms=nanosec_to_millisec(latency),
start_time_ms=nanosec_to_millisec(start_time),
first_chunk=first_chunk,
content=generated_text,
num_chunks=num_chunks,
)
return sr
def get_short_string(input: str) -> str:
n = 20
if len(input) < 400:
return input
return f"{input[:n]}...{input[-n:]}"
def get_token_count(tokenizer: AutoTokenizer, text: str) -> int:
return len(tokenizer(text, add_special_tokens=False).input_ids)
def get_messages_token_count(
tokenizer: AutoTokenizer, messages: list[dict[str, str]]
) -> int:
token_count = 0
for m in messages:
token_count += get_token_count(tokenizer, m["content"])
return token_count
async def send_turn(
session: aiohttp.ClientSession,
client_id: int,
conv_id: str,
conversation_messages: MessagesList,
messages_to_use: int,
tokenizer: AutoTokenizer,
req_args: RequestArgs,
verbose: bool,
verify_output: bool,
) -> RequestStats | None:
assert messages_to_use > 0
assert messages_to_use <= len(conversation_messages)
messages = conversation_messages[:messages_to_use]
# Index of the next message (the role should be "user")
index = messages_to_use - 1
# Verify that the message has only two keys, "role" and "content"
assert len(messages[index].keys()) == 2
assert "role" in messages[index] and "content" in messages[index]
assert messages[index]["role"] == "user", (
f"Failed on conversation ID {conv_id}, message role should be user"
)
if verbose:
print(
f"{Color.CYAN}Messages (conversation ID {conv_id},"
f" {len(messages)} turns):{Color.RESET}",
messages,
)
# None means that there is no upper/lower limit for the output token count
min_tokens = None if req_args.limit_min_tokens < 0 else req_args.limit_min_tokens
max_tokens = None if req_args.limit_max_tokens < 0 else req_args.limit_max_tokens
if len(conversation_messages) > messages_to_use:
# The conversation contains an assistant answer for the next user prompt
if (
min_tokens == NUM_TOKENS_FROM_DATASET
or max_tokens == NUM_TOKENS_FROM_DATASET
):
# Compute number of tokens in the answer (from the input conversation)
assistant_answer = conversation_messages[messages_to_use]
answer_num_tokens = get_token_count(tokenizer, assistant_answer["content"])
assert assistant_answer["role"] == "assistant"
if min_tokens == NUM_TOKENS_FROM_DATASET:
min_tokens = max(1, answer_num_tokens)
if max_tokens == NUM_TOKENS_FROM_DATASET:
max_tokens = max(1, answer_num_tokens)
# Send the current conversation to LLM and get a response
response: ServerResponse = await send_request(
session,
messages,
req_args.chat_url,
req_args.model,
req_args.stream,
min_tokens,
max_tokens,
req_args.timeout_sec,
)
if response.valid is False:
# Request failed
return None
# Compute number of tokens in input / output
input_num_tokens = get_messages_token_count(tokenizer, messages)
# Num tokens in the user's last question
question_num_tokens = get_token_count(tokenizer, messages[index]["content"])
# Num tokens in the history/context of the question
assert input_num_tokens >= question_num_tokens
history_num_tokens = input_num_tokens - question_num_tokens
# Num tokens in the LLM's answer (first chunk and full answer)
first_chunk_tokens = get_token_count(tokenizer, response.first_chunk)
output_content = response.content
output_num_tokens = get_token_count(tokenizer, output_content)
# Prefix caching approximated cached percent
approx_cached_percent = (
100.0 * (history_num_tokens / input_num_tokens) if input_num_tokens > 0 else 0.0
)
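# Illustrative example: 900 history tokens out of 1000 input tokens means the
# conversation history (~90% of the prompt) is assumed to hit the prefix cache.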
# Compute the correct TTFT and TPOT (based on tokens and not chunks).
# Required because multiple output tokens may be bundled in a single chunk.
if output_num_tokens > 1 and output_num_tokens > first_chunk_tokens:
# More than one token and more than one chunk in the output
decode_ms = response.latency_ms - response.ttft_ms
decode_num_tokens = output_num_tokens - first_chunk_tokens
tpot_ms = decode_ms / decode_num_tokens
else:
# In this case: output_num_tokens == first_chunk_tokens
# Output was a single chunk (output_num_tokens > 1)
# or even a single token (output_num_tokens == 1)
tpot_ms = 0.0
if first_chunk_tokens > 1:
# First chunk had multiple tokens, adjust TTFT for a single token
delta_ms = (first_chunk_tokens - 1) * tpot_ms
ttft_ms = max(0.1, response.ttft_ms - delta_ms)
else:
# First chunk had only one token
ttft_ms = response.ttft_ms
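# Worked example (illustrative): latency_ms=500, ttft_ms=120, 41 output tokens
# with 5 of them in the first chunk -> tpot_ms = (500 - 120) / 36 ≈ 10.56 and
# ttft_ms is reduced by (5 - 1) * 10.56 ≈ 42.2, giving ≈ 77.8.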
rs = RequestStats(
ttft_ms=ttft_ms,
tpot_ms=tpot_ms,
latency_ms=response.latency_ms,
start_time_ms=response.start_time_ms,
input_num_turns=len(messages),
input_num_tokens=input_num_tokens,
output_num_tokens=output_num_tokens,
output_num_chunks=response.num_chunks,
output_num_first_chunk_tokens=first_chunk_tokens,
approx_cached_percent=approx_cached_percent,
conversation_id=conv_id,
client_id=client_id,
)
if verbose:
print(
f"\n{Color.YELLOW}Response ({output_num_tokens} tokens):{Color.RESET}",
output_content,
)
print(f"{Color.YELLOW}Response metrics: {rs}{Color.RESET}")
print("-" * 70)
# Save the LLM's answer (will be used as part of the context for the next user turn)
answer_index = messages_to_use
if len(conversation_messages) > answer_index:
assert conversation_messages[answer_index]["role"] == "assistant", (
f"Failed on conversation ID {conv_id}, message role should be assistant"
)
orig_content = conversation_messages[answer_index]["content"]
if verify_output:
# Compare the new answer to the answer from the input file
debug_info = (
f"LLM/dataset answers do not match ({conv_id}):"
f"\n'{get_short_string(output_content)}' (len: {len(output_content)}),"
f"\n'{get_short_string(orig_content)}' (len: {len(orig_content)})"
)
if orig_content != output_content:
raise ValueError(debug_info)
# Update the answer
conversation_messages[answer_index]["content"] = output_content
else:
# A user prompt that has no answer, add the answer as a new message
new_answer = {"role": "assistant", "content": output_content}
conversation_messages.append(new_answer)
return rs
async def poisson_sleep(request_rate: float, verbose: bool = False) -> None:
# Generate a random time interval from the Poisson distribution
assert request_rate > 0
interval = np.random.exponential(1.0 / request_rate)
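# e.g. request_rate=2.0 draws sleep intervals with a mean of 0.5 seconds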
if verbose:
logger.info(f"Sleeping for {interval:.3f} seconds...")
await asyncio.sleep(interval)
async def exponential_backoff_sleep(
attempt_cnt: int,
base_rate: float = 1.0,
backoff_factor: float = 2.0,
jitter_fraction: float = 0.10,
verbose: bool = False,
) -> None:
# Sleep with exponential backoff and jitter after a failed request.
backoff_delay = base_rate * (backoff_factor**attempt_cnt)
jittered_delay = backoff_delay * (
1 + np.random.uniform(-jitter_fraction, jitter_fraction)
)
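# e.g. with the defaults (base_rate=1.0, backoff_factor=2.0), attempts 0, 1, 2
# back off for roughly 1s, 2s and 4s, each with up to +/-10% jitter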
if verbose:
logger.info(f"Backoff for {jittered_delay:.3f} seconds...")
await asyncio.sleep(jittered_delay)
async def client_main(
args: ClientArgs,
req_args: RequestArgs,
client_id: int,
tokenizer: AutoTokenizer,
stop_event: mp.Event, # type: ignore
task_queue: mp.Queue,
result_queue: mp.Queue,
conv_queue: mp.Queue,
) -> None:
logger.info(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
# Set unique seed per client (each client runs in its own process)
# Add 1 to ensure no client uses the same seed as the main process
client_seed = args.seed + client_id + 1
random.seed(client_seed)
np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
conv_id_queue: deque = deque(maxlen=args.max_active_conversations)
# Keep track of how many messages have been used for each conversation
turns_count: Counter = Counter()
num_successes = 0
num_failures = 0
# Track the timestamp (time.perf_counter())
# of the last turn per conversation (only for debug)
time_of_last_turn: dict[ConvId, float] = {}
# Flag that indicates that there are no new tasks (conversations) for the client
task_queue_empty = False
async with aiohttp.ClientSession() as session:
# Print progress
while task_queue_empty is False:
result = None
if (
args.max_num_requests
and num_successes + num_failures == args.max_num_requests
):
logger.info(
f"{Color.YELLOW}Client {client_id} reached "
f"request limit{Color.RESET}"
)
break
if stop_event.is_set(): # type: ignore
logger.info(
f"{Color.YELLOW}Client {client_id} received "
f"a termination signal{Color.RESET}"
)
break
while (
len(active_convs) < args.max_active_conversations
and task_queue_empty is False
):
# Get a new conversation from the task queue
conv_id, messages = task_queue.get()
if conv_id is TERM_SIGNAL:
task_queue_empty = True
break
if args.skip_first_turn:
# Skip the first turn (both user and assistant),
# relevant if warmup was enabled.
# Default turns_count[conv_id] will be zero if conv_id
# was never inserted/updated in turns_count.
turns_count[conv_id] += 2
if turns_count[conv_id] < len(messages):
# Add new conversation
active_convs[conv_id] = messages
conv_id_queue.append(conv_id)
if args.verbose:
logger.info(
f"{Color.GREEN}Client {client_id} will use conversation ID {conv_id} (active conversations {len(active_convs)}){Color.RESET}" # noqa: E501
)
elif args.verbose:
# No more messages (conversation finished during the warmup)
logger.info(
f"{Color.YELLOW}Client {client_id} will not use conversation ID {conv_id} (all {len(messages)} messages already sent){Color.RESET}" # noqa: E501
)
if len(active_convs) == 0 or task_queue_empty:
logger.info(
f"{Color.YELLOW}Client {client_id} has no more work{Color.RESET}"
)
break
# Pick an active conversation for the next request
if args.conversation_sampling == ConversationSampling.ROUND_ROBIN:
conv_id = conv_id_queue.pop()
else:
# ConversationSampling.RANDOM
active_ids = list(active_convs.keys())
conv_id = random.choice(active_ids)
messages = active_convs[conv_id]
assert isinstance(messages, list) and len(messages) > 0
# Update the number of messages to use
turns_count[conv_id] += 1
current_turn = turns_count[conv_id]
assert current_turn < len(messages), (
f"Turn number {current_turn} is invalid for conversation ID {conv_id}"
f" that has only {len(messages)} messages"
)
if args.verbose:
curr_time_sec: float = time.perf_counter()
time_since_last_turn: str | float = "N/A"
if conv_id in time_of_last_turn:
time_since_last_turn = round(
curr_time_sec - time_of_last_turn[conv_id], 3
)
logger.info(
f"Client {client_id} using conversation ID {conv_id} (turn: {current_turn}, time since last turn [sec]: {time_since_last_turn})" # noqa: E501
)
time_of_last_turn[conv_id] = curr_time_sec
success = False
for attempt_cnt in range(args.max_retries + 1):
try:
exception = False
result = await send_turn(
session,
client_id,
conv_id,
messages,
current_turn,
tokenizer,
req_args,
args.print_content,
args.verify_output,
)
if result is not None:
result_queue.put(result)
success = True
break
else:
logger.warning(
f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
)
except asyncio.exceptions.TimeoutError:
exception = True
logger.error(
"%sClient %d - Timeout during conversation ID %s (turn: %d). "
"Base timeout is %ss (set with --request-timeout-sec), but the "
"effective timeout may be longer based on max_tokens. If this "
"is unexpected, consider increasing the timeout or checking "
"model performance.%s",
Color.RED,
client_id,
conv_id,
current_turn,
req_args.timeout_sec,
Color.RESET,
)
except Exception:
exception = True
logger.exception(
f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501
)
# Sleep before retry if not last attempt
if not success and attempt_cnt < args.max_retries:
await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose)
if not success:
num_failures += 1
# Remove the conversation (should not be used again)
active_convs.pop(conv_id)
if exception:
break # Exit gracefully instead of raising an error
else:
num_successes += 1
# Update the turns counter to include the LLM response
# The LLM response will be used as context for the next user turn
turns_count[conv_id] += 1
max_turns = len(messages)
if args.max_turns is not None:
# Limit the number of turns in the conversation
max_turns = min(args.max_turns, max_turns)
if turns_count[conv_id] >= max_turns:
# Conversation has no more turns (no longer active)
# save the updated conversation (with the LLM server's answer)
conv_queue.put((conv_id, active_convs.pop(conv_id)))
if args.verbose:
logger.info(
f"{Color.GREEN}Client {client_id} finished "
f"conversation ID {conv_id}{Color.RESET}"
)
else:
# Conversation is not finished, insert it at the back of the queue
conv_id_queue.appendleft(conv_id)
# Sleep between requests (if lambda is positive)
if args.request_rate > 0:
await poisson_sleep(args.request_rate, args.verbose)
# Send indication that the client is done
conv_queue.put((TERM_SIGNAL, TERM_SIGNAL))
logger.info(
f"{Color.CYAN}Client {client_id} is done "
f"({num_successes=}, {num_failures=}){Color.RESET}"
)
def worker_function(
client_id: int,
tokenizer: AutoTokenizer,
client_args: ClientArgs,
req_args: RequestArgs,
stop_event: mp.Event, # type: ignore
task_queue: mp.Queue,
result_queue: mp.Queue,
conv_queue: mp.Queue,
) -> None:
asyncio.run(
client_main(
client_args,
req_args,
client_id,
tokenizer,
stop_event,
task_queue,
result_queue,
conv_queue,
)
)
def get_client_config(
args: argparse.Namespace, input_conv: ConversationsMap
) -> tuple[ClientArgs, RequestArgs]:
if args.num_clients < 1:
raise ValueError("Number of clients must be a positive number")
if len(input_conv) < args.num_clients:
raise ValueError(
"Number of conversations must be equal or larger than the number of clients"
)
max_req_per_client: int | None = None
if args.max_num_requests is not None:
# Max number of requests per client
req_per_client = args.max_num_requests // args.num_clients
if req_per_client < 1:
raise ValueError("Number of requests should be at least one per client")
max_req_per_client = req_per_client
max_active_conversations = args.max_active_conversations
if max_active_conversations is None:
# Each client will have only one active conversation at a time
max_active_conversations = args.num_clients
if max_active_conversations > len(input_conv):
raise ValueError(
f"Max active conversations {max_active_conversations} "
"must be equal or less than the total number of conversations"
)
# Max number of active conversations per client
max_active_conv_per_client = max_active_conversations // args.num_clients
if max_active_conv_per_client < 1:
raise ValueError(
f"Max active conversations {max_active_conversations} "
"must be equal or greater than the number of clients"
)
# Skip the first user turn (as part of the warmup)
skip_first_turn = args.warmup_step
# Common arguments for all clients
client_args = ClientArgs(
seed=args.seed,
max_num_requests=max_req_per_client,
skip_first_turn=skip_first_turn,
max_turns=args.max_turns,
max_active_conversations=max_active_conv_per_client,
verbose=args.verbose,
print_content=args.print_content,
verify_output=args.verify_output,
conversation_sampling=args.conversation_sampling,
request_rate=args.request_rate,
max_retries=args.max_retries,
)
if args.limit_min_tokens > 0 or args.limit_max_tokens > 0:
if args.limit_min_tokens < 1 or args.limit_max_tokens < 1:
raise ValueError(
"Invalid min/max tokens limits (both limits should be provided)"
)
if args.limit_min_tokens > args.limit_max_tokens:
raise ValueError(
"Invalid min/max tokens limits (min should not be larger than max)"
)
if args.request_timeout_sec <= 0:
raise ValueError("Request timeout must be a positive number")
# Arguments for API requests
chat_url = f"{args.url}/v1/chat/completions"
model_name = args.served_model_name if args.served_model_name else args.model
req_args = RequestArgs(
chat_url=chat_url,
model=model_name,
stream=not args.no_stream,
limit_min_tokens=args.limit_min_tokens,
limit_max_tokens=args.limit_max_tokens,
timeout_sec=args.request_timeout_sec,
)
return client_args, req_args
async def main_mp(
client_args: ClientArgs,
req_args: RequestArgs,
bench_args: BenchmarkArgs,
tokenizer: AutoTokenizer,
input_conv: ConversationsMap,
) -> tuple[ConversationsMap, list[RequestStats]]:
# An event that will trigger graceful termination of all the clients
stop_event = mp.Event()
# Queue for input conversations (from the input file/dataset)
task_queue: mp.Queue = mp.Queue()
# Queue for client measurements (TTFT, TPOT, etc. for each request)
result_queue: mp.Queue = mp.Queue()
# Queue for output conversations (with the LLM answers, sent by the server)
conv_queue: mp.Queue = mp.Queue()
output_conv: ConversationsMap = {}
client_metrics: list[RequestStats] = []
# Start all clients
start_time = time.perf_counter_ns()
logger.info(f"{Color.GREEN}Starting {bench_args.num_clients} clients{Color.RESET}")
clients = []
for client_id in range(bench_args.num_clients):
client = mp.Process(
name=f"client_{client_id}",
target=worker_function,
args=(
client_id,
tokenizer,
client_args,
req_args,
stop_event,
task_queue,
result_queue,
conv_queue,
),
)
clients.append(client)
client.start()
# Submit all the input conversations as tasks for the clients
for conv_id, messages in input_conv.items():
task_queue.put((conv_id, messages))
# Add termination signals for clients
for _ in range(bench_args.num_clients):
task_queue.put((TERM_SIGNAL, TERM_SIGNAL))
# Collect the updated conversations from all clients
num_clients_finished = 0
total_convs = len(input_conv)
debug_stats = DebugStats(logger, min(15 * bench_args.num_clients, 500))
while num_clients_finished < bench_args.num_clients:
# Collect updated conversation
conv_id, messages = conv_queue.get()
# Collect results (measurements)
while not result_queue.empty():
new_data = result_queue.get()
client_metrics.append(new_data)
debug_stats.update(new_data)
if conv_id is TERM_SIGNAL:
num_clients_finished += 1
logger.info(
f"{Color.CYAN}{num_clients_finished} out of "
f"{bench_args.num_clients} clients finished{Color.RESET}"
)
if bench_args.early_stop and not stop_event.is_set():
# Once one client finishes, stop all the other clients.
# There is no reason to continue the benchmark with fewer clients.
logger.info(
f"{Color.YELLOW}Sending termination signal to clients{Color.RESET}"
)
stop_event.set()
else:
output_conv[conv_id] = messages
finished_convs = len(output_conv)
percent = finished_convs / total_convs
# Tuned to control the print rate (can be changed if required)
print_cycle = max(3, int(bench_args.num_clients / 4))
if finished_convs % print_cycle == 0:
runtime_sec = nanosec_to_sec(time.perf_counter_ns() - start_time)
logger.info(
f"{Color.CYAN}Finished {finished_convs} out of {total_convs} conversations ({percent:.0%}), " # noqa: E501
f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501
)
rps: str | float = round(len(client_metrics) / runtime_sec, 3)
if len(client_metrics) < (5 * bench_args.num_clients):
# Do not estimate the RPS if the number of samples is very low
# (threshold can be tuned if needed)
rps = "N/A"
runtime_left_sec: str | float = round(
(runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
if percent < 0.05:
# If less than 5% of the conversations have finished,
# the estimation will probably be very inaccurate
# (threshold can be tuned if needed).
runtime_left_sec = "N/A"
logger.info(
f"{Color.CYAN}Estimated req/sec {rps}, estimated runtime left {runtime_left_sec} sec{Color.RESET}" # noqa: E501
)
debug_stats.print()
logger.info(
f"{Color.CYAN}All {bench_args.num_clients} clients finished{Color.RESET}"
)
# At this point all the clients finished,
# collect results (TTFT, TPOT, etc.) from all the clients.
# This needs to happen before calling join on the clients
# (result_queue should be emptied).
while not result_queue.empty():
client_metrics.append(result_queue.get())
logger.info(f"Collected {len(client_metrics)} samples from all the clients")
# Wait for all clients to finish
for client in clients:
logger.info(
f"{Color.CYAN}Waiting for client {client.name} "
f"(is alive: {client.is_alive()}){Color.RESET}"
)
client.join(timeout=req_args.timeout_sec + 1)
if client.is_alive():
logger.warning(
f"{Color.YELLOW}Client {client.name} will be terminated{Color.RESET}"
)
client.terminate()
exitcode = client.exitcode
if exitcode != 0:
logger.error(
f"{Color.RED}Client {client.name} exited "
f"with exit code {exitcode}{Color.RESET}"
)
logger.info(
f"All {bench_args.num_clients} clients exited (successfully "
f"finished {len(output_conv)} out of {total_convs} conversations)"
)
# Queues should be closed, required to avoid hang at interpreter shutdown
unfinished_tasks = 0
while not task_queue.empty():
task_queue.get()
unfinished_tasks += 1
if unfinished_tasks > 0:
# Can happen if not all tasks (conversations) have finished.
# May happen if --max-num-requests was used,
# or if an error occurred in one of the clients.
logger.debug(f"Discarding {unfinished_tasks} unfinished tasks")
task_queue.close()
task_queue.join_thread()
result_queue.close()
result_queue.join_thread()
conv_queue.close()
conv_queue.join_thread()
return output_conv, client_metrics
def get_filename_with_timestamp(label: str, extension: str) -> str:
time_now = datetime.now()
timestamp = time_now.strftime("%d-%m-%Y_%H-%M-%S")
filename = f"{label}__{timestamp}.{extension}"
return filename
def process_statistics(
client_metrics: list[RequestStats],
warmup_percentages: list[float],
test_params: dict,
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
warmup_runtime_sec: float | None = None,
) -> None:
if len(client_metrics) == 0:
logger.info("No samples to process")
return
logger.info(f"Processing {len(client_metrics)} samples...")
raw_data = pd.DataFrame(client_metrics)
if verbose:
# Calculate the time between user turns in each conversation (in a new column)
raw_data = raw_data.sort_values(by=["conversation_id", "start_time_ms"])
raw_data["time_between_user_turns_sec"] = raw_data.groupby("conversation_id")[
"start_time_ms"
].diff()
# Convert milliseconds to seconds
raw_data["time_between_user_turns_sec"] = (
raw_data["time_between_user_turns_sec"] / 1000.0
)
# Final raw data should be sorted by time
raw_data = raw_data.sort_values(by=["start_time_ms"])
raw_data["end_time_ms"] = raw_data["start_time_ms"] + raw_data["latency_ms"]
percentiles = [0.25, 0.5, 0.75, 0.9]
# Add more percentiles if there are enough samples
if len(raw_data) >= 100:
percentiles.append(0.99)
if len(raw_data) >= 1000:
percentiles.append(0.999)
if len(raw_data) >= 10000:
percentiles.append(0.9999)
# Set precision for numbers in the output text (the dataframes)
pd.set_option("display.precision", 2)
# Exclude parameters from RequestStats
exclude = [
"start_time_ms",
"end_time_ms",
"output_num_first_chunk_tokens",
"approx_cached_percent",
"conversation_id",
"client_id",
]
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Parameters:{Color.RESET}")
for k, v in test_params.items():
print(f"{k}={v}")
# conversations generation parameters
if gen_conv_args is not None:
gen_params = {
"text_files": ", ".join(gen_conv_args.text_files),
"input_num_turns": str(gen_conv_args.input_num_turns),
"input_common_prefix_num_tokens": str(
gen_conv_args.input_common_prefix_num_tokens
),
"input_prefix_num_tokens": str(gen_conv_args.input_prefix_num_tokens),
"input_num_tokens": str(gen_conv_args.input_num_tokens),
"output_num_tokens": str(gen_conv_args.output_num_tokens),
}
print(f"{Color.YELLOW}Conversations Generation Parameters:{Color.RESET}")
for k, v in gen_params.items():
print(f"{k}={v}")
print(TEXT_SEPARATOR)
params_list = []
df_list = []
for percent in warmup_percentages:
# Select samples from the end (tail) of the dataframe
warmup_count = int(percent * len(raw_data))
tail_count = len(raw_data) - warmup_count
if tail_count == 0:
# No reason to process if the count of samples is zero
break
df = raw_data.tail(tail_count)
# Runtime is the diff between the end of the last request
# and the start of the first request
runtime_sec = df["end_time_ms"].iloc[-1] - df["start_time_ms"].iloc[0]
# Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec
params = {
"runtime_sec": runtime_sec,
"requests_per_sec": requests_per_sec,
}
if warmup_runtime_sec is not None:
params["warmup_runtime_sec"] = warmup_runtime_sec
params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
# List for Excel file
params_list.append(params)
df_list.append(df)
# Print the statistics summary
if percent > 0 or len(warmup_percentages) > 1:
print(
f"{Color.YELLOW}Statistics summary "
f"(assuming {percent:.0%} warmup samples):{Color.RESET}"
)
else:
print(f"{Color.YELLOW}Statistics summary:{Color.RESET}")
for k, v in params.items():
if isinstance(v, float):
print(f"{k} = {v:.3f}")
else:
print(f"{k} = {v}")
print(TEXT_SEPARATOR)
print(df)
print(TEXT_SEPARATOR)
if excel_output:
prefix = f"statistics_{test_params['num_clients']}_clients"
filename = get_filename_with_timestamp(prefix, "xlsx")
with pd.ExcelWriter(filename, engine="xlsxwriter") as writer:
startrow = 0
test_params_df = pd.DataFrame([test_params])
test_params_df.to_excel(
writer, sheet_name="Summary", index=False, startrow=startrow
)
startrow += len(test_params_df) + 3
if gen_conv_args is not None:
gen_params_df = pd.DataFrame([gen_params])
gen_params_df.to_excel(
writer, sheet_name="Summary", index=False, startrow=(startrow - 1)
)
startrow += len(gen_params_df) + 3
for params, df_stats in zip(params_list, df_list):
df_params = pd.DataFrame([params])
df_params.to_excel(
writer, sheet_name="Summary", index=False, startrow=startrow
)
startrow += len(df_params) + 2
df_stats.to_excel(
writer, sheet_name="Summary", index=True, startrow=startrow
)
startrow += len(df_stats) + 3
raw_data.to_excel(writer, sheet_name="Raw data", index=False, startrow=0)
logger.info(
f"{Color.GREEN}Client metrics exported to file: {filename}{Color.RESET}"
)
async def get_server_info(url: str) -> None:
logger.info(f"{Color.BLUE}Collecting information from server: {url}{Color.RESET}")
async with aiohttp.ClientSession() as session:
# Get server version (not mandatory, "version" endpoint may not exist)
url_version = f"{url}/version"
async with session.get(url_version) as response:
if HTTPStatus(response.status) == HTTPStatus.OK:
text = await response.text()
logger.info(f"{Color.BLUE}Server version: {text}{Color.RESET}")
# Get available models
url_models = f"{url}/v1/models"
async with session.get(url_models) as response:
if HTTPStatus(response.status) == HTTPStatus.OK:
text = await response.text()
logger.info(f"{Color.BLUE}Models:{Color.RESET}")
models_data = json.loads(text)
models_list = models_data["data"]
for model in models_list:
model_id = model["id"]
max_model_len = model.get("max_model_len", "N/A")
logger.info(
f"{Color.BLUE}\t{model_id=}, {max_model_len=}{Color.RESET}"
)
else:
logger.info(f"{Color.RED}Failed to get models{Color.RESET}")
async def main() -> None:
parser = argparse.ArgumentParser(
prog="Benchmark serving with multi-turn conversations",
description="Benchmark online inference using REST API",
)
parser.add_argument("--version", action="version", version="%(prog)s 1.0")
parser.add_argument(
"-i",
"--input-file",
type=str,
required=True,
help="Input JSON file with ShareGPT conversations or "
"configuration file for generation of synthetic conversations",
)
parser.add_argument(
"-o",
"--output-file",
type=str,
default=None,
help="Output JSON file containing conversations with updated assistant answers",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Seed for random number generators (default: 0)",
)
parser.add_argument(
"-m", "--model", type=str, required=True, help="Path of the LLM model"
)
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the `--model` argument. ",
)
parser.add_argument(
"-u",
"--url",
type=str,
default="http://localhost:8000",
help="Base URL for the LLM API server",
)
parser.add_argument(
"-p",
"--num-clients",
type=int,
default=1,
help="Number of clients that will send requests in parallel",
)
parser.add_argument(
"-k",
"--max-active-conversations",
type=int,
default=None,
help="Max number of active conversations at a time (for all clients)",
)
parser.add_argument(
"-n",
"--max-num-requests",
type=int,
default=None,
help="Max number of requests to send (total for all clients)",
)
parser.add_argument(
"--warmup-step",
default=False,
action="store_true",
help="Run a warmup step (using only the first turn of every conversation), "
"measurements will not be included in the final benchmark results",
)
parser.add_argument(
"--max-turns",
type=int,
default=None,
help="Maximum number of turns/messages per conversation, "
"includes both user and assistant messages "
"(a positive number, e.g: 2, 4, 6, etc.), disabled by default",
)
parser.add_argument(
"--no-early-stop",
default=False,
action="store_true",
help="By default, the benchmark will stop if at least one client exits."
" Use this flag to disable this behavior",
)
parser.add_argument(
"--limit-max-tokens",
type=int,
default=NUM_TOKENS_FROM_DATASET,
help="Set max_tokens for the output token count of each request "
"(must also set --limit-min-tokens). "
"Overrides output token count from the input dataset. "
"Use a negative value to disable this limit.",
)
parser.add_argument(
"--limit-min-tokens",
type=int,
default=NUM_TOKENS_FROM_DATASET,
help="Set min_tokens for the output token count of each request "
"(must also set --limit-max-tokens). "
"Overrides output token count from the input dataset. "
"Use a negative value to disable this limit.",
)
parser.add_argument(
"--request-rate",
type=float,
default=0,
help="Expected request rate (Poisson process) per client in requests/sec."
"Set to 0 for no delay between requests.",
)
parser.add_argument(
"--max-retries",
type=int,
default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")),
help="Maximum number of retry attempts for timed-out requests. "
"Default is 0 (no retries). "
"Set to higher values to retry failed requests and maintain "
"fair workload distribution. "
"Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.",
)
parser.add_argument(
"--conversation-sampling",
type=ConversationSampling,
choices=list(ConversationSampling),
default=ConversationSampling.ROUND_ROBIN,
help=(
"Strategy for selecting which conversation to use for the next request. "
"Options: 'round_robin' (cycle through conversations), "
"'random' (pick randomly)."
),
)
parser.add_argument(
"--verify-output",
default=False,
action="store_true",
help="Verify the LLM output (compare to the answers in the input JSON file)",
)
parser.add_argument(
"--request-timeout-sec",
type=int,
default=120,
help="Timeout in seconds for each API request (default: 120). "
"Automatically increased if max tokens imply longer decoding.",
)
parser.add_argument(
"--no-stream",
default=False,
action="store_true",
help="Disable stream/streaming mode (set 'stream' to False in the API request)",
)
parser.add_argument(
"-e",
"--excel-output",
default=False,
action="store_true",
help="Export summary to Excel file (optional)",
)
parser.add_argument(
"-v",
"--verbose",
default=False,
action="store_true",
help="Enable verbose output",
)
parser.add_argument(
"--print-content",
default=False,
action="store_true",
help="Print the user prompts and the server's answers",
)
parser.add_argument(
"--warmup-percentages",
type=str,
default="0%",
help="Ignore the first X samples as warmup (X is a percentage)."
" A comma separated list of percentages can be used "
"(for example: --warmup-percentages=0%%,50%%)",
)
args = parser.parse_args()
logger.info(args)
logger.info(f"{Color.GREEN}Input parameters:{Color.RESET}")
logger.info(f"url={args.url}")
logger.info(f"model={args.model}")
logger.info(f"num_clients={args.num_clients}")
if args.verify_output:
logger.info(f"{Color.PURPLE}Verify is enabled{Color.RESET}")
# Calculate the number of samples to filter out (as warmup samples/measurements).
try:
warmup_percentages: list[float] = [0.0]
if not args.warmup_step:
# Warmup percentages are only applied if the warmup step was not used
warmup_strings: list[str] = args.warmup_percentages.split(",")
warmup_strings = [x.replace("%", "") for x in warmup_strings]
warmup_percentages = [float(x) / 100 for x in warmup_strings]
# Check for valid range (0 to 1)
for p in warmup_percentages:
assert p >= 0.0 and p < 1.0
# Sort from high to low warmup percentage
warmup_percentages.sort()
logger.info(
f"Warmup percentages (percentage of samples): {warmup_percentages}"
)
except Exception:
raise ValueError(
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
# Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)
logger.info("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(args.model)
await get_server_info(args.url)
# Load the input file (either conversations or a configuration file)
logger.info(f"Reading input file: {args.input_file}")
with open(args.input_file) as f:
input_data = json.load(f)
gen_conv_args = None
if isinstance(input_data, list):
# The conversations are stored as a list of dicts
logger.info(f"Found {len(input_data)} items in the input file")
# Convert the list to a ConversationsMap
conversations = conversations_list_to_dict(input_data)
elif isinstance(input_data, dict):
# The input file is a configuration file
# (type is determined by the field 'filetype')
if "filetype" not in input_data:
raise Exception(
f"Input file {args.input_file} is invalid (missing 'filetype')"
)
logger.info(f"Using input file with filetype: {input_data['filetype']}")
gen_conv_args = parse_input_json_file(input_data)
# Disable warning from "huggingface/tokenizers"
# (when using python multiprocessing and tokenizers)
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Generate synthetic conversations
conversations = generate_conversations(gen_conv_args, tokenizer)
else:
raise Exception(f"Input file {args.input_file} is invalid")
if args.max_turns is not None:
if args.max_turns < 1:
raise ValueError("Max turns must be a positive number")
logger.info(
f"{Color.PURPLE}Max turns per conversation "
f"is limited to {args.max_turns}{Color.RESET}"
)
# Create benchmark configurations
client_args, req_args = get_client_config(args, conversations)
bench_args = BenchmarkArgs(
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
)
warmup_runtime_sec: float | None = None
# Warm-up step
if args.warmup_step:
# Only send a single user prompt from every conversation.
# max_active_conversations must be 1,
# otherwise the clients may exit after sending a single request
# (because the task queue is empty).
warmup_client_args = client_args._replace(
skip_first_turn=False, max_turns=1, max_active_conversations=1
)
# Early stop should be disabled,
# all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False)
logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
)
warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.PURPLE,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark
benchmark_start_ns = time.perf_counter_ns()
client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations
)
benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
# Calculate requests per second
requests_per_sec = len(client_metrics) / benchmark_runtime_sec
benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
logger.info(
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
"requests per second: %.3f%s",
Color.GREEN,
benchmark_runtime_sec,
benchmark_runtime_ms,
requests_per_sec,
Color.RESET,
)
if warmup_runtime_sec is not None:
total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
logger.info(
"%sWarmup runtime: %.3f sec (%.3f ms)%s",
Color.GREEN,
warmup_runtime_sec,
warmup_runtime_sec * 1000,
Color.RESET,
)
logger.info(
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
Color.GREEN,
total_runtime_sec,
total_runtime_sec * 1000,
Color.RESET,
)
# Benchmark parameters
params = {
"model": args.model,
"num_clients": args.num_clients,
"num_conversations": len(conversations),
"active_conversations": args.max_active_conversations,
"seed": args.seed,
}
if args.limit_min_tokens > 0:
params["min_tokens"] = args.limit_min_tokens
if args.limit_max_tokens > 0:
params["max_tokens"] = args.limit_max_tokens
# Process and print statistics (and save excel file with the statistics)
process_statistics(
client_metrics,
test_params=params,
warmup_percentages=warmup_percentages,
verbose=args.verbose,
gen_conv_args=gen_conv_args,
excel_output=args.excel_output,
warmup_runtime_sec=warmup_runtime_sec,
)
if args.output_file is not None:
# Write a JSON file with the updated conversations
# The "assistant" content will contain the answers from the tested LLM
output_data: ShareGptConversations = conversations_dict_to_list(client_convs)
logger.info(
f"{Color.GREEN}Writing conversations file: {args.output_file}{Color.RESET}"
)
with open(args.output_file, "w") as f:
json.dump(output_data, f, indent=4)
if __name__ == "__main__":
asyncio.run(main())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json
Convert to OpenAI API:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""
import argparse
import json
import random
from statistics import mean
from typing import Any
import pandas as pd # type: ignore
import tqdm # type: ignore
from transformers import AutoTokenizer # type: ignore
def has_non_english_chars(text: str) -> bool:
return not text.isascii()
def content_is_valid(
content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
if max_content_len and len(content) > max_content_len:
return False
    # Content with non-English (non-ASCII) characters is considered invalid
    return not has_non_english_chars(content)
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
# Collect statistics
stats = []
print("\nCollecting statistics...")
for item in tqdm.tqdm(conversations):
# item has "id" and "messages"
messages = item["messages"]
user_turns = 0
assistant_turns = 0
user_words = 0
assistant_words = 0
conv_chars = 0
user_tokens: list[int] = []
assistant_tokens: list[int] = []
for m in messages:
content = m["content"]
conv_chars += len(content)
content_num_words = content.count(" ") + 1
num_tokens = 0
if tokenizer:
num_tokens = len(tokenizer(m["content"]).input_ids)
if m["role"] == "user":
user_turns += 1
user_words += content_num_words
if tokenizer:
user_tokens.append(num_tokens)
elif m["role"] == "assistant":
assistant_turns += 1
assistant_words += content_num_words
if tokenizer:
assistant_tokens.append(num_tokens)
# assert user_turns == assistant_turns, \
# f"Invalid conversation ID {item['id']}"
conv_words = user_words + assistant_words
item_stats = {
"user_turns": user_turns,
"assistant_turns": assistant_turns,
"user_words": user_words,
"assistant_words": assistant_words,
"conv_turns": len(messages),
"conv_words": conv_words,
"conv_characters": conv_chars,
}
if len(user_tokens) > 0:
item_stats["user_tokens"] = int(mean(user_tokens))
if len(assistant_tokens) > 0:
item_stats["assistant_tokens"] = int(mean(assistant_tokens))
stats.append(item_stats)
print("\nStatistics:")
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
df = pd.DataFrame(stats)
print(df.describe(percentiles=percentiles).transpose())
def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: int | None,
min_content_len: int | None = None,
max_content_len: int | None = None,
min_turns: int | None = None,
max_turns: int | None = None,
model: str | None = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns
if min_content_len and max_content_len:
# Verify that min is not larger than max if both were given
assert min_content_len <= max_content_len
print(
f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
f" {max_content_len=}, {min_turns=}, {max_turns=}\n"
)
random.seed(seed)
tokenizer = None
if model is not None:
print(f"Loading tokenizer from: {model}")
tokenizer = AutoTokenizer.from_pretrained(model)
# Read the ShareGPT JSON file
print(f"Reading file: {input_file}")
with open(input_file, encoding="utf-8") as f:
# Should be a list of dicts
# Each dict should have "id" (string) and "conversations" (list of dicts)
sharegpt_data = json.load(f)
assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"
print(f"Total items in input file: {len(sharegpt_data):,}")
print(f"Shuffling dataset with seed {seed}")
random.shuffle(sharegpt_data)
# Map each conversation ID to all of its messages
conversation_parts: dict[str, list[Any]] = {}
for item in tqdm.tqdm(sharegpt_data):
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
# Conversation ID (e.g. "hiWPlMD") and part/session (0, 1, 2, etc.)
conv_id, _ = item["id"].split("_")
new_turns = item["conversations"]
if conv_id not in conversation_parts:
# Start new conversation
conversation_parts[conv_id] = []
elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
prev_turns = conversation_parts[conv_id][-1]
if prev_turns[-1]["from"] == new_turns[0]["from"]:
new_turns = new_turns[1:]
if len(new_turns) > 0:
# We assume that parts are in order in the ShareGPT dataset
conversation_parts[conv_id].append(new_turns)
dataset: list[dict[str, Any]] = []
for conv_id, conv_parts in conversation_parts.items():
new_item = {"id": conv_id}
conversations: list[dict[str, str]] = []
# Merge all parts
for conv_part in conv_parts:
conversations.extend(conv_part)
if len(conversations) > 0:
new_item["conversations"] = conversations
dataset.append(new_item)
print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")
# Final output data
final_openai_dataset: list[dict] = []
# Filter conversations from the ShareGPT dataset and convert to OpenAI format
for item in tqdm.tqdm(dataset):
messages: list[dict] = []
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
conv_id = item["id"]
conversations = item["conversations"]
if min_turns is not None and len(conversations) < min_turns:
# Skip short conversations
continue
# Convert each message in the conversation, up to max_turns if specified
for i, turn in enumerate(conversations):
assert "from" in turn and "value" in turn, (
f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
)
role = None
turn_from = turn["from"]
if turn_from in {"human", "user"}:
role = "user"
elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
role = "assistant"
elif turn_from == "system":
role = "system"
assert role is not None, (
f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
)
if i == 0 and role != "user":
# If the first message is from assistant (gpt), skip it.
# this happens when the conversation is a follow-up
# to a previous conversation (from the same user).
continue
if max_turns is not None and i >= max_turns:
break
# Convert message to OpenAI format (with "role" and "content")
content = turn["value"]
messages.append({"role": role, "content": content})
# Add the converted conversation to the OpenAI format
if len(messages) > 0:
valid_messages = True
# First turn should always be from the user
user_turn = True
for m in messages:
# Make sure that turns alternate between user and assistant
if (user_turn and m["role"] != "user") or (
not user_turn and m["role"] != "assistant"
):
valid_messages = False
break
user_turn = not user_turn
content = m["content"]
valid_messages = content_is_valid(
content, min_content_len, max_content_len
)
if not valid_messages:
break
if valid_messages is True:
final_openai_dataset.append({"id": conv_id, "messages": messages})
assert len(final_openai_dataset) > 0, "Final number of conversations is zero"
print_stats(final_openai_dataset)
print_stats_again = False
if max_items is not None and len(final_openai_dataset) > max_items:
print(f"\n\nSampling {max_items} items from the dataset...")
print_stats_again = True
final_openai_dataset = random.sample(final_openai_dataset, max_items)
if print_stats_again:
# Print stats after the dataset changed
print_stats(final_openai_dataset, tokenizer)
# Write the converted data to a new JSON file
final_size = len(final_openai_dataset)
print(f"\nTotal conversations converted (after filtering): {final_size:,}")
print(f"\nWriting file: {output_file}")
with open(output_file, "w", encoding="utf-8") as f:
json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)
def main() -> None:
parser = argparse.ArgumentParser(
description="Convert ShareGPT dataset to OpenAI API format"
)
parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
parser.add_argument(
"output_file", help="Path to the output OpenAI format JSON file"
)
parser.add_argument(
"--seed", type=int, default=0, help="Seed for random number generators"
)
parser.add_argument(
"--max-items",
type=int,
default=None,
help="Maximum number of items in the output file",
)
parser.add_argument(
"--min-turns",
type=int,
default=None,
help="Minimum number of turns per conversation",
)
parser.add_argument(
"--max-turns",
type=int,
default=None,
help="Maximum number of turns per conversation",
)
parser.add_argument(
"--min-content-len",
type=int,
default=None,
help="Min number of characters in the messages' content",
)
parser.add_argument(
"--max-content-len",
type=int,
default=None,
help="Max number of characters in the messages' content",
)
parser.add_argument(
"--model",
type=str,
default=None,
help="LLM model, only the tokenizer will be used",
)
args = parser.parse_args()
convert_sharegpt_to_openai(
args.seed,
args.input_file,
args.output_file,
args.max_items,
args.min_content_len,
args.max_content_len,
args.min_turns,
args.max_turns,
args.model,
)
if __name__ == "__main__":
main()
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1
tqdm>=4.66
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile
import pstats
from vllm import LLM, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
LONG_PROMPT = " ".join(LONG_PROMPT)
def main(args):
llm = LLM(
model=args.model,
enforce_eager=True,
enable_prefix_caching=True,
tensor_parallel_size=args.tensor_parallel_size,
)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
profiler = cProfile.Profile()
print("------warm up------")
for i in range(3):
output = llm.generate(LONG_PROMPT, sampling_params)
print(output[0].outputs[0].text)
print("------start generating------")
for i in range(3):
profiler.runctx(
"llm.generate(LONG_PROMPT, sampling_params)", globals(), locals()
)
# analyze the runtime of hashing function
stats = pstats.Stats(profiler)
stats.sort_stats("cumulative")
total_time = 0
total_calls = 0
for func in stats.stats:
if "hash_of_block" in func[2]:
total_time = stats.stats[func][3]
total_calls = stats.stats[func][0]
percentage = (total_time / stats.total_tt) * 100
print(
f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime."
)
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the performance of hashing function in"
"automatic prefix caching."
)
parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k")
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--output-len", type=int, default=10)
parser.add_argument(
"--enable-prefix-caching", action="store_true", help="enable prefix caching"
)
args = parser.parse_args()
main(args)
#!/bin/bash
# default values
MODEL=${MODEL:-"Qwen/Qwen2.5-7B-Instruct"}
BACKEND=${BACKEND:-"vllm"}
DATASET=${DATASET:-"xgrammar_bench"}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_DIR=${OUTPUT_DIR:-"$SCRIPT_DIR/structured_output_benchmark_results"}
PORT=${PORT:-8000}
STRUCTURED_OUTPUT_RATIO=${STRUCTURED_OUTPUT_RATIO:-1}
TOTAL_SECONDS=${TOTAL_SECONDS:-90}
MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-300}
TOKENIZER_MODE=${TOKENIZER_MODE:-"auto"}
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " --model MODEL Model to benchmark (default: $MODEL)"
echo " --backend BACKEND Backend to use (default: $BACKEND)"
echo " --dataset DATASET Dataset to use (default: $DATASET)"
echo " --max-new-tokens N Maximum number of tokens to generate (default: $MAX_NEW_TOKENS)"
echo " --output-dir DIR Output directory for results (default: $OUTPUT_DIR)"
echo " --port PORT Port to use (default: $PORT)"
echo " --structured-output-ratio N Ratio of structured outputs (default: $STRUCTURED_OUTPUT_RATIO)"
echo " --tokenizer-mode MODE Tokenizer mode to use (default: $TOKENIZER_MODE)"
echo " --total-seconds N Total seconds to run the benchmark (default: $TOTAL_SECONDS)"
echo " -h, --help Show this help message and exit"
exit 0
}
# parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
--backend)
BACKEND="$2"
shift 2
;;
--dataset)
DATASET="$2"
shift 2
;;
--max-new-tokens)
MAX_NEW_TOKENS="$2"
shift 2
;;
--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
--port)
PORT="$2"
shift 2
;;
--structured-output-ratio)
STRUCTURED_OUTPUT_RATIO="$2"
shift 2
;;
--tokenizer-mode)
TOKENIZER_MODE="$2"
shift 2
;;
--total-seconds)
TOTAL_SECONDS="$2"
shift 2
;;
-h|--help)
usage
;;
*)
printf "Unknown argument: %s\n" "$1"
usage
;;
esac
done
# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# Define QPS values to test
QPS_VALUES=(25 20 15 10 5 1)
# Common parameters
COMMON_PARAMS=(
--backend "$BACKEND"
--model "$MODEL"
--dataset "$DATASET"
--structured-output-ratio "$STRUCTURED_OUTPUT_RATIO"
--save-results
--result-dir "$OUTPUT_DIR"
--output-len "$MAX_NEW_TOKENS"
--port "$PORT"
--tokenizer-mode "$TOKENIZER_MODE"
)
echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
echo "Dataset: $DATASET"
echo "Results will be saved to: $OUTPUT_DIR"
echo "----------------------------------------"
# Run benchmarks with different QPS values
for qps in "${QPS_VALUES[@]}"; do
echo "Running benchmark with QPS: $qps"
# Get git hash and branch for the filename
GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run
FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json"
NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part
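# e.g. TOTAL_SECONDS=90 and qps=25 -> 90 * 25 = 2250 prompts for this run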
echo "Running benchmark with $NUM_PROMPTS prompts"
# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \
--request-rate "$qps" \
--result-filename "$FILENAME" \
--num-prompts "$NUM_PROMPTS"
echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"
done
echo "All benchmarks completed!"
echo "Results saved to: $OUTPUT_DIR"
FROM fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed'st thy light'st flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel.
Thou that art now the world's fresh ornament
And only herald to the gaudy spring,
Within thine own bud buriest thy content
And, tender churl, makest waste in niggarding.
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.
When forty winters shall beseige thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery, so gazed on now,
Will be a tatter'd weed, of small worth held:
Then being ask'd where all thy beauty lies,
Where all the treasure of thy lusty days,
To say, within thine own deep-sunken eyes,
Were an all-eating shame and thriftless praise.
How much more praise deserved thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.
Look in thy glass, and tell the face thou viewest
Now is the time that face should form another;
Whose fresh repair if now thou not renewest,
Thou dost beguile the world, unbless some mother.
For where is she so fair whose unear'd womb
Disdains the tillage of thy husbandry?
Or who is he so fond will be the tomb
Of his self-love, to stop posterity?
Thou art thy mother's glass, and she in thee
Calls back the lovely April of her prime:
So thou through windows of thine age shall see
Despite of wrinkles this thy golden time.
But if thou live, remember'd not to be,
Die single, and thine image dies with thee.
Unthrifty loveliness, why dost thou spend
Upon thyself thy beauty's legacy?
Nature's bequest gives nothing but doth lend,
And being frank she lends to those are free.
Then, beauteous niggard, why dost thou abuse
The bounteous largess given thee to give?
Profitless usurer, why dost thou use
So great a sum of sums, yet canst not live?
For having traffic with thyself alone,
Thou of thyself thy sweet self dost deceive.
Then how, when nature calls thee to be gone,
What acceptable audit canst thou leave?
Thy unused beauty must be tomb'd with thee,
Which, used, lives th' executor to be.
Those hours, that with gentle work did frame
The lovely gaze where every eye doth dwell,
Will play the tyrants to the very same
And that unfair which fairly doth excel:
For never-resting time leads summer on
To hideous winter and confounds him there;
Sap cheque'd with frost and lusty leaves quite gone,
Beauty o'ersnow'd and bareness every where:
Then, were not summer's distillation left,
A liquid prisoner pent in walls of glass,
Beauty's effect with beauty were bereft,
Nor it nor no remembrance what it was:
But flowers distill'd though they with winter meet,
Leese but their show; their substance still lives sweet.
Then let not winter's ragged hand deface
In thee thy summer, ere thou be distill'd:
Make sweet some vial; treasure thou some place
With beauty's treasure, ere it be self-kill'd.
That use is not forbidden usury,
Which happies those that pay the willing loan;
That's for thyself to breed another thee,
Or ten times happier, be it ten for one;
Ten times thyself were happier than thou art,
If ten of thine ten times refigured thee:
Then what could death do, if thou shouldst depart,
Leaving thee living in posterity?
Be not self-will'd, for thou art much too fair
To be death's conquest and make worms thine heir.
Lo! in the orient when the gracious light
Lifts up his burning head, each under eye
Doth homage to his new-appearing sight,
Serving with looks his sacred majesty;
And having climb'd the steep-up heavenly hill,
Resembling strong youth in his middle age,
yet mortal looks adore his beauty still,
Attending on his golden pilgrimage;
But when from highmost pitch, with weary car,
Like feeble age, he reeleth from the day,
The eyes, 'fore duteous, now converted are
From his low tract and look another way:
So thou, thyself out-going in thy noon,
Unlook'd on diest, unless thou get a son.
Music to hear, why hear'st thou music sadly?
Sweets with sweets war not, joy delights in joy.
Why lovest thou that which thou receivest not gladly,
Or else receivest with pleasure thine annoy?
If the true concord of well-tuned sounds,
By unions married, do offend thine ear,
They do but sweetly chide thee, who confounds
In singleness the parts that thou shouldst bear.
Mark how one string, sweet husband to another,
Strikes each in each by mutual ordering,
Resembling sire and child and happy mother
Who all in one, one pleasing note do sing:
Whose speechless song, being many, seeming one,
Sings this to thee: 'thou single wilt prove none.'
Is it for fear to wet a widow's eye
That thou consumest thyself in single life?
Ah! if thou issueless shalt hap to die.
The world will wail thee, like a makeless wife;
The world will be thy widow and still weep
That thou no form of thee hast left behind,
When every private widow well may keep
By children's eyes her husband's shape in mind.
Look, what an unthrift in the world doth spend
Shifts but his place, for still the world enjoys it;
But beauty's waste hath in the world an end,
And kept unused, the user so destroys it.
No love toward others in that bosom sits
That on himself such murderous shame commits.
For shame! deny that thou bear'st love to any,
Who for thyself art so unprovident.
Grant, if thou wilt, thou art beloved of many,
But that thou none lovest is most evident;
For thou art so possess'd with murderous hate
That 'gainst thyself thou stick'st not to conspire.
Seeking that beauteous roof to ruinate
Which to repair should be thy chief desire.
O, change thy thought, that I may change my mind!
Shall hate be fairer lodged than gentle love?
Be, as thy presence is, gracious and kind,
Or to thyself at least kind-hearted prove:
Make thee another self, for love of me,
That beauty still may live in thine or thee.
As fast as thou shalt wane, so fast thou growest
In one of thine, from that which thou departest;
And that fresh blood which youngly thou bestowest
Thou mayst call thine when thou from youth convertest.
Herein lives wisdom, beauty and increase:
Without this, folly, age and cold decay:
If all were minded so, the times should cease
And threescore year would make the world away.
Let those whom Nature hath not made for store,
Harsh featureless and rude, barrenly perish:
Look, whom she best endow'd she gave the more;
Which bounteous gift thou shouldst in bounty cherish:
She carved thee for her seal, and meant thereby
Thou shouldst print more, not let that copy die.
When I do count the clock that tells the time,
And see the brave day sunk in hideous night;
When I behold the violet past prime,
And sable curls all silver'd o'er with white;
When lofty trees I see barren of leaves
Which erst from heat did canopy the herd,
And summer's green all girded up in sheaves
Borne on the bier with white and bristly beard,
Then of thy beauty do I question make,
That thou among the wastes of time must go,
Since sweets and beauties do themselves forsake
And die as fast as they see others grow;
And nothing 'gainst Time's scythe can make defence
Save breed, to brave him when he takes thee hence.
O, that you were yourself! but, love, you are
No longer yours than you yourself here live:
Against this coming end you should prepare,
And your sweet semblance to some other give.
So should that beauty which you hold in lease
Find no determination: then you were
Yourself again after yourself's decease,
When your sweet issue your sweet form should bear.
Who lets so fair a house fall to decay,
Which husbandry in honour might uphold
Against the stormy gusts of winter's day
And barren rage of death's eternal cold?
O, none but unthrifts! Dear my love, you know
You had a father: let your son say so.
Not from the stars do I my judgment pluck;
And yet methinks I have astronomy,
But not to tell of good or evil luck,
Of plagues, of dearths, or seasons' quality;
Nor can I fortune to brief minutes tell,
Pointing to each his thunder, rain and wind,
Or say with princes if it shall go well,
By oft predict that I in heaven find:
But from thine eyes my knowledge I derive,
And, constant stars, in them I read such art
As truth and beauty shall together thrive,
If from thyself to store thou wouldst convert;
Or else of thee this I prognosticate:
Thy end is truth's and beauty's doom and date.
When I consider every thing that grows
Holds in perfection but a little moment,
That this huge stage presenteth nought but shows
Whereon the stars in secret influence comment;
When I perceive that men as plants increase,
Cheered and cheque'd even by the self-same sky,
Vaunt in their youthful sap, at height decrease,
And wear their brave state out of memory;
Then the conceit of this inconstant stay
Sets you most rich in youth before my sight,
Where wasteful Time debateth with Decay,
To change your day of youth to sullied night;
And all in war with Time for love of you,
As he takes from you, I engraft you new.
But wherefore do not you a mightier way
Make war upon this bloody tyrant, Time?
And fortify yourself in your decay
With means more blessed than my barren rhyme?
Now stand you on the top of happy hours,
And many maiden gardens yet unset
With virtuous wish would bear your living flowers,
Much liker than your painted counterfeit:
So should the lines of life that life repair,
Which this, Time's pencil, or my pupil pen,
Neither in inward worth nor outward fair,
Can make you live yourself in eyes of men.
To give away yourself keeps yourself still,
And you must live, drawn by your own sweet skill.
Who will believe my verse in time to come,
If it were fill'd with your most high deserts?
Though yet, heaven knows, it is but as a tomb
Which hides your life and shows not half your parts.
If I could write the beauty of your eyes
And in fresh numbers number all your graces,
The age to come would say 'This poet lies:
Such heavenly touches ne'er touch'd earthly faces.'
So should my papers yellow'd with their age
Be scorn'd like old men of less truth than tongue,
And your true rights be term'd a poet's rage
And stretched metre of an antique song:
But were some child of yours alive that time,
You should live twice; in it and in my rhyme.
Shall I compare thee to a summer's day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,
And summer's lease hath all too short a date:
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimm'd;
And every fair from fair sometime declines,
By chance or nature's changing course untrimm'd;
But thy eternal summer shall not fade
Nor lose possession of that fair thou owest;
Nor shall Death brag thou wander'st in his shade,
When in eternal lines to time thou growest:
So long as men can breathe or eyes can see,
So long lives this and this gives life to thee.
Devouring Time, blunt thou the lion's paws,
And make the earth devour her own sweet brood;
Pluck the keen teeth from the fierce tiger's jaws,
And burn the long-lived phoenix in her blood;
Make glad and sorry seasons as thou fleets,
And do whate'er thou wilt, swift-footed Time,
To the wide world and all her fading sweets;
But I forbid thee one most heinous crime:
O, carve not with thy hours my love's fair brow,
Nor draw no lines there with thine antique pen;
Him in thy course untainted do allow
For beauty's pattern to succeeding men.
Yet, do thy worst, old Time: despite thy wrong,
My love shall in my verse ever live young.
A woman's face with Nature's own hand painted
Hast thou, the master-mistress of my passion;
A woman's gentle heart, but not acquainted
With shifting change, as is false women's fashion;
An eye more bright than theirs, less false in rolling,
Gilding the object whereupon it gazeth;
A man in hue, all 'hues' in his controlling,
Much steals men's eyes and women's souls amazeth.
And for a woman wert thou first created;
Till Nature, as she wrought thee, fell a-doting,
And by addition me of thee defeated,
By adding one thing to my purpose nothing.
But since she prick'd thee out for women's pleasure,
Mine be thy love and thy love's use their treasure.
So is it not with me as with that Muse
Stirr'd by a painted beauty to his verse,
Who heaven itself for ornament doth use
And every fair with his fair doth rehearse
Making a couplement of proud compare,
With sun and moon, with earth and sea's rich gems,
With April's first-born flowers, and all things rare
That heaven's air in this huge rondure hems.
O' let me, true in love, but truly write,
And then believe me, my love is as fair
As any mother's child, though not so bright
As those gold candles fix'd in heaven's air:
Let them say more than like of hearsay well;
I will not praise that purpose not to sell.
My glass shall not persuade me I am old,
So long as youth and thou are of one date;
But when in thee time's furrows I behold,
Then look I death my days should expiate.
For all that beauty that doth cover thee
Is but the seemly raiment of my heart,
Which in thy breast doth live, as thine in me:
How can I then be elder than thou art?
O, therefore, love, be of thyself so wary
As I, not for myself, but for thee will;
Bearing thy heart, which I will keep so chary
As tender nurse her babe from faring ill.
Presume not on thy heart when mine is slain;
Thou gavest me thine, not to give back again.
As an unperfect actor on the stage
Who with his fear is put besides his part,
Or some fierce thing replete with too much rage,
Whose strength's abundance weakens his own heart.
So I, for fear of trust, forget to say
The perfect ceremony of love's rite,
And in mine own love's strength seem to decay,
O'ercharged with burden of mine own love's might.
O, let my books be then the eloquence
And dumb presagers of my speaking breast,
Who plead for love and look for recompense
More than that tongue that more hath more express'd.
O, learn to read what silent love hath writ:
To hear with eyes belongs to love's fine wit.
Mine eye hath play'd the painter and hath stell'd
Thy beauty's form in table of my heart;
My body is the frame wherein 'tis held,
And perspective it is the painter's art.
For through the painter must you see his skill,
To find where your true image pictured lies;
Which in my bosom's shop is hanging still,
That hath his windows glazed with thine eyes.
Now see what good turns eyes for eyes have done:
Mine eyes have drawn thy shape, and thine for me
Are windows to my breast, where-through the sun
Delights to peep, to gaze therein on thee;
Yet eyes this cunning want to grace their art;
They draw but what they see, know not the heart.
Let those who are in favour with their stars
Of public honour and proud titles boast,
Whilst I, whom fortune of such triumph bars,
Unlook'd for joy in that I honour most.
Great princes' favourites their fair leaves spread
But as the marigold at the sun's eye,
And in themselves their pride lies buried,
For at a frown they in their glory die.
The painful warrior famoused for fight,
After a thousand victories once foil'd,
Is from the book of honour razed quite,
And all the rest forgot for which he toil'd:
Then happy I, that love and am beloved
Where I may not remove nor be removed.
Lord of my love, to whom in vassalage
Thy merit hath my duty strongly knit,
To thee I send this written embassage,
To witness duty, not to show my wit:
Duty so great, which wit so poor as mine
May make seem bare, in wanting words to show it,
But that I hope some good conceit of thine
In thy soul's thought, all naked, will bestow it;
Till whatsoever star that guides my moving
Points on me graciously with fair aspect
And puts apparel on my tatter'd loving,
To show me worthy of thy sweet respect:
Then may I dare to boast how I do love thee;
Till then not show my head where thou mayst prove me.
Weary with toil, I haste me to my bed,
The dear repose for limbs with travel tired;
But then begins a journey in my head,
To work my mind, when body's work's expired:
For then my thoughts, from far where I abide,
Intend a zealous pilgrimage to thee,
And keep my drooping eyelids open wide,
Looking on darkness which the blind do see
Save that my soul's imaginary sight
Presents thy shadow to my sightless view,
Which, like a jewel hung in ghastly night,
Makes black night beauteous and her old face new.
Lo! thus, by day my limbs, by night my mind,
For thee and for myself no quiet find.
How can I then return in happy plight,
That am debarr'd the benefit of rest?
When day's oppression is not eased by night,
But day by night, and night by day, oppress'd?
And each, though enemies to either's reign,
Do in consent shake hands to torture me;
The one by toil, the other to complain
How far I toil, still farther off from thee.
I tell the day, to please them thou art bright
And dost him grace when clouds do blot the heaven:
So flatter I the swart-complexion'd night,
When sparkling stars twire not thou gild'st the even.
But day doth daily draw my sorrows longer
And night doth nightly make grief's strength seem stronger.
When, in disgrace with fortune and men's eyes,
I all alone beweep my outcast state
And trouble deal heaven with my bootless cries
And look upon myself and curse my fate,
Wishing me like to one more rich in hope,
Featured like him, like him with friends possess'd,
Desiring this man's art and that man's scope,
With what I most enjoy contented least;
Yet in these thoughts myself almost despising,
Haply I think on thee, and then my state,
Like to the lark at break of day arising
From sullen earth, sings hymns at heaven's gate;
For thy sweet love remember'd such wealth brings
That then I scorn to change my state with kings.
When to the sessions of sweet silent thought
I summon up remembrance of things past,
I sigh the lack of many a thing I sought,
And with old woes new wail my dear time's waste:
Then can I drown an eye, unused to flow,
For precious friends hid in death's dateless night,
And weep afresh love's long since cancell'd woe,
And moan the expense of many a vanish'd sight:
Then can I grieve at grievances foregone,
And heavily from woe to woe tell o'er
The sad account of fore-bemoaned moan,
Which I new pay as if not paid before.
But if the while I think on thee, dear friend,
All losses are restored and sorrows end.
Thy bosom is endeared with all hearts,
Which I by lacking have supposed dead,
And there reigns love and all love's loving parts,
And all those friends which I thought buried.
How many a holy and obsequious tear
Hath dear religious love stol'n from mine eye
As interest of the dead, which now appear
But things removed that hidden in thee lie!
Thou art the grave where buried love doth live,
Hung with the trophies of my lovers gone,
Who all their parts of me to thee did give;
That due of many now is thine alone:
Their images I loved I view in thee,
And thou, all they, hast all the all of me.
If thou survive my well-contented day,
When that churl Death my bones with dust shall cover,
And shalt by fortune once more re-survey
These poor rude lines of thy deceased lover,
Compare them with the bettering of the time,
And though they be outstripp'd by every pen,
Reserve them for my love, not for their rhyme,
Exceeded by the height of happier men.
O, then vouchsafe me but this loving thought:
'Had my friend's Muse grown with this growing age,
A dearer birth than this his love had brought,
To march in ranks of better equipage:
But since he died and poets better prove,
Theirs for their style I'll read, his for his love.'
Full many a glorious morning have I seen
Flatter the mountain-tops with sovereign eye,
Kissing with golden face the meadows green,
Gilding pale streams with heavenly alchemy;
Anon permit the basest clouds to ride
With ugly rack on his celestial face,
And from the forlorn world his visage hide,
Stealing unseen to west with this disgrace:
Even so my sun one early morn did shine
With all triumphant splendor on my brow;
But out, alack! he was but one hour mine;
The region cloud hath mask'd him from me now.
Yet him for this my love no whit disdaineth;
Suns of the world may stain when heaven's sun staineth.
Why didst thou promise such a beauteous day,
And make me travel forth without my cloak,
To let base clouds o'ertake me in my way,
Hiding thy bravery in their rotten smoke?
'Tis not enough that through the cloud thou break,
To dry the rain on my storm-beaten face,
For no man well of such a salve can speak
That heals the wound and cures not the disgrace:
Nor can thy shame give physic to my grief;
Though thou repent, yet I have still the loss:
The offender's sorrow lends but weak relief
To him that bears the strong offence's cross.
Ah! but those tears are pearl which thy love sheds,
And they are rich and ransom all ill deeds.
No more be grieved at that which thou hast done:
Roses have thorns, and silver fountains mud;
Clouds and eclipses stain both moon and sun,
And loathsome canker lives in sweetest bud.
All men make faults, and even I in this,
Authorizing thy trespass with compare,
Myself corrupting, salving thy amiss,
Excusing thy sins more than thy sins are;
For to thy sensual fault I bring in sense--
Thy adverse party is thy advocate--
And 'gainst myself a lawful plea commence:
Such civil war is in my love and hate
That I an accessary needs must be
To that sweet thief which sourly robs from me.
Let me confess that we two must be twain,
Although our undivided loves are one:
So shall those blots that do with me remain
Without thy help by me be borne alone.
In our two loves there is but one respect,
Though in our lives a separable spite,
Which though it alter not love's sole effect,
Yet doth it steal sweet hours from love's delight.
I may not evermore acknowledge thee,
Lest my bewailed guilt should do thee shame,
Nor thou with public kindness honour me,
Unless thou take that honour from thy name:
But do not so; I love thee in such sort
As, thou being mine, mine is thy good report.
As a decrepit father takes delight
To see his active child do deeds of youth,
So I, made lame by fortune's dearest spite,
Take all my comfort of thy worth and truth.
For whether beauty, birth, or wealth, or wit,
Or any of these all, or all, or more,
Entitled in thy parts do crowned sit,
I make my love engrafted to this store:
So then I am not lame, poor, nor despised,
Whilst that this shadow doth such substance give
That I in thy abundance am sufficed
And by a part of all thy glory live.
Look, what is best, that best I wish in thee:
This wish I have; then ten times happy me!
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(MACOSX_FOUND TRUE)
endif()
#
# Define environment variables for special configurations
#
set(ENABLE_X86_ISA $ENV{VLLM_CPU_X86})
set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16})
include_directories("${CMAKE_SOURCE_DIR}/csrc")
set (ENABLE_NUMA TRUE)
#
# Check the compile flags
#
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION")
endif()
if (NOT MACOSX_FOUND)
execute_process(COMMAND cat /proc/cpuinfo
RESULT_VARIABLE CPUINFO_RET
OUTPUT_VARIABLE CPUINFO)
if (NOT CPUINFO_RET EQUAL 0)
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()
endif()
function (find_isa CPUINFO TARGET OUT)
string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
if(NOT ISA_FOUND EQUAL -1)
set(${OUT} ON PARENT_SCOPE)
else()
set(${OUT} OFF PARENT_SCOPE)
endif()
endfunction()
function(check_sysctl TARGET OUT)
execute_process(COMMAND sysctl -n "${TARGET}"
RESULT_VARIABLE SYSCTL_RET
OUTPUT_VARIABLE SYSCTL_INFO
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(SYSCTL_RET EQUAL 0 AND
(SYSCTL_INFO STREQUAL "1" OR SYSCTL_INFO GREATER 0))
set(${OUT} ON PARENT_SCOPE)
else()
set(${OUT} OFF PARENT_SCOPE)
endif()
endfunction()
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(STATUS "Apple Silicon Detected")
set(APPLE_SILICON_FOUND TRUE)
set(ENABLE_NUMA OFF)
check_sysctl(hw.optional.neon ASIMD_FOUND)
check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
else()
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
# Support cross-compilation by allowing override via environment variables
if (ENABLE_ARM_BF16)
set(ARM_BF16_FOUND ON)
message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable")
endif()
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
set(ENABLE_X86_ISA ON)
if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3))
message(FATAL_ERROR "X86 backend requires gcc/g++ >= 12.3")
endif()
list(APPEND CXX_COMPILE_FLAGS "-mf16c")
list(APPEND CXX_COMPILE_FLAGS_AVX512 ${CXX_COMPILE_FLAGS})
list(APPEND CXX_COMPILE_FLAGS_AVX2 ${CXX_COMPILE_FLAGS})
list(APPEND CXX_COMPILE_FLAGS_AVX512
"-mavx512f"
"-mavx512vl"
"-mavx512bw"
"-mavx512dq"
"-mavx512bf16"
"-mavx512vnni"
"-mamx-bf16"
"-mamx-tile")
list(APPEND CXX_COMPILE_FLAGS_AVX2
"-mavx2")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
message(STATUS "PowerPC detected")
if (POWER9_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power9"
"-mtune=power9")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected")
if(ARM_BF16_FOUND)
message(STATUS "BF16 extension detected")
set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
add_compile_definitions(ARM_BF16_SUPPORT)
else()
message(WARNING "BF16 functionality is not available")
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
endif()
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
elseif (S390_FOUND)
message(STATUS "S390 detected")
# Check for S390 VXE support
list(APPEND CXX_COMPILE_FLAGS
"-mvx"
"-mzvector"
"-march=native"
"-mtune=native")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
if(RVV_FOUND)
message(FATAL_ERROR "RISC-V RVV is not supported yet.")
else()
list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
endif()
else()
message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
# Build oneDNN for GEMM kernels
if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "")
if(ASIMD_FOUND)
# Set number of parallel build processes
include(ProcessorCount)
ProcessorCount(NPROC)
if(NOT NPROC)
set(NPROC 4)
endif()
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
find_library(OPEN_MP
NAMES gomp
PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if (OPEN_MP)
set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
endif()
# Fetch and populate ACL
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
FetchContent_Populate(arm_compute
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.6.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build")
endif()
# Build ACL with CMake
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
-DCMAKE_BUILD_TYPE=Release
-DARM_COMPUTE_ARCH=armv8.2-a
-DARM_COMPUTE_ENABLE_ASSERTS=OFF
-DARM_COMPUTE_ENABLE_CPPTHREADS=OFF
-DARM_COMPUTE_ENABLE_OPENMP=ON
-DARM_COMPUTE_ENABLE_WERROR=OFF
-DARM_COMPUTE_BUILD_EXAMPLES=OFF
-DARM_COMPUTE_BUILD_TESTING=OFF)
set(_cmake_build_cmd
${CMAKE_COMMAND} --build build -- -j${NPROC}
)
execute_process(
COMMAND ${_cmake_config_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
)
execute_process(
COMMAND ${_cmake_build_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
if(NOT _acl_rc EQUAL 0)
message(FATAL_ERROR "ACL build failed (exit ${_acl_rc}).")
endif()
message(STATUS "Arm Compute Library (ACL) built successfully.")
# VLLM/oneDNN settings for ACL
set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE)
add_compile_definitions(VLLM_USE_ACL)
endif()
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
FetchContent_Declare(
oneDNN
SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
)
else()
message(STATUS "Downloading oneDNN from GitHub")
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")
set(ONEDNN_BUILD_TESTS "OFF")
set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ONEDNN_BUILD_GRAPH "OFF")
set(ONEDNN_ENABLE_JIT_PROFILING "ON")
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "ON")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "ON")
set(ONEDNN_VERBOSE "ON")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# TODO: Refactor this
if (ENABLE_X86_ISA)
# Note: only enable oneDNN for AVX512
list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512})
else()
list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS})
endif()
set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE})
set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size
FetchContent_MakeAvailable(oneDNN)
set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE})
add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
target_include_directories(
dnnl_ext
PUBLIC ${oneDNN_SOURCE_DIR}/include
PUBLIC ${oneDNN_BINARY_DIR}/include
PRIVATE ${oneDNN_SOURCE_DIR}/src
)
target_link_libraries(dnnl_ext dnnl torch)
target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC)
list(APPEND LIBS dnnl_ext)
set(USE_ONEDNN ON)
else()
set(USE_ONEDNN OFF)
endif()
# TODO: Refactor this
if (ENABLE_X86_ISA)
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
else()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
endif()
if(ENABLE_NUMA)
list(APPEND LIBS numa)
else()
message(STATUS "NUMA is disabled")
add_compile_definitions(-DVLLM_NUMA_DISABLED)
endif()
#
# Generate CPU attention dispatch header
#
message(STATUS "Generating CPU attention dispatch header")
execute_process(
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/csrc/cpu/generate_cpu_attn_dispatch.py
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/cpu
RESULT_VARIABLE GEN_RESULT
)
if(NOT GEN_RESULT EQUAL 0)
message(FATAL_ERROR "Failed to generate CPU attention dispatch header")
endif()
#
# _C extension
#
set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/torch_bindings.cpp")
if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()
if(USE_ONEDNN)
set(VLLM_EXT_SRC
"csrc/cpu/dnnl_kernels.cpp"
${VLLM_EXT_SRC})
endif()
if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp"
"csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/dnnl_kernels.cpp"
"csrc/cpu/torch_bindings.cpp"
# TODO: Remove these files
"csrc/cpu/activation.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
set(VLLM_EXT_SRC_AVX2
"csrc/cpu/utils.cpp"
"csrc/cpu/cpu_attn.cpp"
"csrc/cpu/torch_bindings.cpp"
# TODO: Remove these files
"csrc/cpu/activation.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
USE_SABI 3
WITH_SOABI
)
# For SGL kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
# For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
define_extension_target(
_C_AVX2
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX2}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
USE_SABI 3
WITH_SOABI
)
else()
message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}")
#
# Define extension targets
#
define_extension_target(
_C
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3
WITH_SOABI
)
endif()
message(STATUS "Enabling C extension.")
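The `VLLM_CPU_X86` and `VLLM_CPU_ARM_BF16` environment variables read at the top of this file let the ISA detection above be overridden, e.g. when cross-compiling. A minimal sketch of how a build might use them follows; the `pip` invocation and the `VLLM_TARGET_DEVICE` value are assumptions about the surrounding build setup, not taken from this file.

```bash
# Cross-compiling for x86: force the AVX512/AVX2 code paths even when the
# build machine's /proc/cpuinfo does not report x86 features
# (feeds ENABLE_X86_ISA above).
export VLLM_CPU_X86=1

# ...or, when targeting an aarch64 machine whose cpuinfo is not visible at
# build time, force-enable the BF16 kernels instead:
# export VLLM_CPU_ARM_BF16=1

# Hypothetical CPU-backend build invocation; the exact command may differ
# between vLLM versions.
VLLM_TARGET_DEVICE=cpu pip install -e . --no-build-isolation
```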
include(FetchContent)
# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory
# instead of downloading.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{FLASH_MLA_SRC_DIR})
set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
endif()
if(FLASH_MLA_SRC_DIR)
FetchContent_Declare(
flashmla
SOURCE_DIR ${FLASH_MLA_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# Vendor FlashMLA interface into vLLM with torch-ops shim.
set(FLASHMLA_VENDOR_DIR "${CMAKE_SOURCE_DIR}/vllm/third_party/flashmla")
file(MAKE_DIRECTORY "${FLASHMLA_VENDOR_DIR}")
file(READ "${flashmla_SOURCE_DIR}/flash_mla/flash_mla_interface.py"
FLASHMLA_INTERFACE_CONTENT)
string(REPLACE "import flash_mla.cuda as flash_mla_cuda"
"import vllm._flashmla_C\nflash_mla_cuda = torch.ops._flashmla_C"
FLASHMLA_INTERFACE_CONTENT
"${FLASHMLA_INTERFACE_CONTENT}")
file(WRITE "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
"${FLASHMLA_INTERFACE_CONTENT}")
# Install the generated flash_mla_interface.py to the wheel
# Use COMPONENT _flashmla_C to ensure it's installed with the C extension
install(FILES "${FLASHMLA_VENDOR_DIR}/flash_mla_interface.py"
DESTINATION vllm/third_party/flashmla/
COMPONENT _flashmla_C)
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
set(SUPPORT_ARCHS)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
list(APPEND SUPPORT_ARCHS "9.0a")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
# CUDA 12.9 introduced "Family-Specific Architecture Features";
# 10.0f covers the whole compute_10x family
list(APPEND SUPPORT_ARCHS "10.0f")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
list(APPEND SUPPORT_ARCHS "10.0a")
endif()
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
# Misc kernels for decoding
${flashmla_SOURCE_DIR}/csrc/smxx/decode/get_decoding_sched_meta/get_decoding_sched_meta.cu
${flashmla_SOURCE_DIR}/csrc/smxx/decode/combine/combine.cu
# sm90 dense decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/fp16.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/instantiations/bf16.cu
# sm90 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/model1_persistent_h128.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h64.cu
${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/instantiations/v32_persistent_h128.cu
# sm90 sparse prefill
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k512_topklen.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/instantiations/phase1_k576_topklen.cu
# sm100 dense prefill & backward
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
# sm100 sparse prefill
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head64/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k512.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd/head128/instantiations/phase1_k576.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_prefill_k512.cu
# sm100 sparse decode
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/v32.cu
${flashmla_SOURCE_DIR}/csrc/sm100/decode/head64/instantiations/model1.cu
${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd_for_small_topk/head128/instantiations/phase1_decode_k512.cu
)
set(FlashMLA_Extension_SOURCES
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/kerutils/include
${flashmla_SOURCE_DIR}/csrc/sm90
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set(FlashMLA_Extension_INCLUDES
${flashmla_SOURCE_DIR}/csrc
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
)
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_Extension_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
define_extension_target(
_flashmla_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_SOURCES}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
# Also enable C++20 for the FlashMLA sources (required for std::span, requires, etc.)
target_compile_options(_flashmla_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-std=c++20>
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++20>)
define_extension_target(
_flashmla_extension_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_Extension_SOURCES}
COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
USE_SABI 3
WITH_SOABI)
# Keep Stable ABI for the module, but *not* for CUDA/C++ files.
# This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
target_compile_options(_flashmla_extension_C PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
$<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
else()
message(STATUS "FlashMLA will not compile: no supported CUDA architecture in ${CUDA_ARCHS} (needs sm90a with CUDA 12.3+ or sm100 with CUDA 12.8+)")
# Create empty targets for setup.py on unsupported systems
add_custom_target(_flashmla_C)
add_custom_target(_flashmla_extension_C)
endif()
include(FetchContent)
set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory")
if(DEFINED ENV{QUTLASS_SRC_DIR})
set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR})
endif()
if(QUTLASS_SRC_DIR)
FetchContent_Declare(
qutlass
SOURCE_DIR ${QUTLASS_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
qutlass
GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git
GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()
message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}")
else()
cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS)
if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)")
set(QUTLASS_TARGET_CC 100)
elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
set(QUTLASS_TARGET_CC 120)
else()
message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.")
endif()
set(QUTLASS_SOURCES
${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu
${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu
)
set(QUTLASS_INCLUDES
${qutlass_SOURCE_DIR}
${qutlass_SOURCE_DIR}/qutlass
${qutlass_SOURCE_DIR}/qutlass/csrc/include
${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions
)
if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}")
elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h")
list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include")
message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).")
else()
message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. "
"Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include")
endif()
set_gencode_flags_for_srcs(
SRCS "${QUTLASS_SOURCES}"
CUDA_ARCHS "${QUTLASS_ARCHS}"
)
target_sources(_C PRIVATE ${QUTLASS_SOURCES})
target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES})
target_compile_definitions(_C PRIVATE
QUTLASS_DISABLE_PYBIND=1
TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC}
)
set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3>
)
else()
if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8")
message(STATUS
"[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).")
else()
message(STATUS
"[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a / 10.3a / 10.0f) found in "
"CUDA_ARCHS='${CUDA_ARCHS}'.")
endif()
endif()
# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
set(DEFAULT_TRITON_KERNELS_TAG "v3.6.0")
# Set TRITON_KERNELS_SRC_DIR for local development with vLLM. TRITON_KERNELS_SRC_DIR is expected to
# point directly at the triton_kernels python directory.
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
FetchContent_Declare(
triton_kernels
SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
)
else()
set(TRITON_GIT "https://github.com/triton-lang/triton.git")
message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
FetchContent_Declare(
triton_kernels
# TODO (varun) : Fetch just the triton_kernels directory from Triton
GIT_REPOSITORY https://github.com/triton-lang/triton.git
GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
GIT_PROGRESS TRUE
SOURCE_SUBDIR python/triton_kernels/triton_kernels
)
endif()
# Fetch content
FetchContent_MakeAvailable(triton_kernels)
if (NOT triton_kernels_SOURCE_DIR)
message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
endif()
if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
else()
set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
endif()
message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
add_custom_target(triton_kernels)
# Ensure the vllm/third_party directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
## Copy .py files to install directory.
install(DIRECTORY
${TRITON_KERNELS_PYTHON_DIR}
DESTINATION
vllm/third_party/triton_kernels/
COMPONENT triton_kernels
FILES_MATCHING PATTERN "*.py")
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc). Since we clear the
# arches in the CUDA case (and instead set the gencodes on a per-file basis),
# we need to set VLLM_GPU_ARCHES manually here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2), --component _vllm_fa3_C (for FA3),
# or --component _vllm_fa4_cutedsl_C (for FA4 CuteDSL Python files).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 140c00c0241bb60cc6e44e7c1be9998d4b20d8d2
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Make sure vllm-flash-attn install rules are nested under vllm/
# ALL_COMPONENTS ensures the save/modify/restore runs exactly once regardless
# of how many components are being installed, avoiding double-append of /vllm/.
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Restore the install prefix after FA's install rules
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Install shared Python files for both FA2 and FA3 components
foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C)
# Ensure the vllm/vllm_flash_attn directory exists before installation
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")"
COMPONENT ${_FA_COMPONENT})
# Copy vllm_flash_attn python files (except __init__.py and flash_attn_interface.py
# which are source-controlled in vllm)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm/vllm_flash_attn
COMPONENT ${_FA_COMPONENT}
FILES_MATCHING PATTERN "*.py"
PATTERN "__init__.py" EXCLUDE
PATTERN "flash_attn_interface.py" EXCLUDE
)
endforeach()
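Several of the FetchContent blocks above honour local-source overrides and component-scoped installs. The sketch below shows how those hooks might be used during development; the paths are placeholders and the `build` directory in the `cmake --install` call is an assumed, pre-configured build tree.

```bash
# Point the build at local checkouts instead of letting FetchContent clone
# them (each variable is read by the corresponding block above).
export FLASH_MLA_SRC_DIR=/path/to/FlashMLA
export QUTLASS_SRC_DIR=/path/to/qutlass
export TRITON_KERNELS_SRC_DIR=/path/to/triton/python/triton_kernels/triton_kernels
export VLLM_FLASH_ATTN_SRC_DIR=/path/to/vllm-flash-attention

# After configuring and building, install only a single component, e.g. the
# FA2 kernels; omitting --component installs everything, including
# vllm-flash-attn (see the comments above).
cmake --install build --component _vllm_fa2_C
```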
#
# FA4 CuteDSL component
# This is a Python-only component that copies the flash_attn/cute directory
# and transforms imports to match our package structure.
#
add_custom_target(_vllm_fa4_cutedsl_C)
# Copy flash_attn/cute directory (needed for FA4) and transform imports
# The cute directory uses flash_attn.cute imports internally, which we replace
# with vllm.vllm_flash_attn.cute to match our package structure.
install(CODE "
file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\")
foreach(SRC_FILE \${CUTE_PY_FILES})
file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE})
set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\")
get_filename_component(DST_DIR \${DST_FILE} DIRECTORY)
file(MAKE_DIRECTORY \${DST_DIR})
file(READ \${SRC_FILE} FILE_CONTENTS)
string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\")
file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\")
endforeach()
" COMPONENT _vllm_fa4_cutedsl_C)