Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -8,11 +8,11 @@ from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random,
set_random_seed,
)
logger = init_logger(__name__)
......@@ -36,7 +36,7 @@ def run_benchmark(
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
set_random_seed(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
......
......@@ -7,15 +7,15 @@ import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random_flash,
set_random_seed,
)
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
logger = init_logger(__name__)
......@@ -49,7 +49,7 @@ def run_benchmark(
if implementation == "triton" and kv_cache_layout == "HND":
return float("nan") # Triton does not support HND layout yet.
current_platform.seed_everything(42)
set_random_seed(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
......
......@@ -23,9 +23,9 @@ import torch
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
persistent_masked_m_silu_mul_quant,
)
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from vllm.utils.torch_utils import set_random_seed
@triton.jit
......@@ -207,7 +207,7 @@ def benchmark(
):
def generate_data(seed_offset=0):
"""Generate input data with given seed offset"""
current_platform.seed_everything(42 + seed_offset)
set_random_seed(42 + seed_offset)
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
if gen_strategy == "random_imbalanced":
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
import time
import numpy as np
import torch
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Check if CPU MoE operations are available
try:
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
except (ImportError, AttributeError) as e:
print("ERROR: CPU fused MoE operations are not available on this platform.")
print("This benchmark requires x86 CPU with proper vLLM CPU extensions compiled.")
print(
"The cpu_fused_moe kernel is typically available on Linux x86_64 "
"with AVX2/AVX512."
)
print(f"Import error: {e}")
sys.exit(1)
# ISA selection following test_cpu_fused_moe.py pattern
ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
@torch.inference_mode()
def main(
batch_size: int,
expert_num: int,
hidden_size: int,
intermediate_size: int,
topk_num: int,
use_bias: bool = False,
dtype: torch.dtype = torch.bfloat16,
activation: str = "silu",
isa: str = "vec",
seed: int = 0,
iters: int = 20,
) -> None:
current_platform.seed_everything(seed)
# up_dim = 2 * intermediate_size for gate + up projection
up_dim = 2 * intermediate_size
input_tensor = torch.randn((batch_size, hidden_size), dtype=dtype) / (
0.5 * hidden_size**0.5
)
w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
0.5 * hidden_size**0.5
)
w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
0.5 * intermediate_size**0.5
)
w13_bias = None
w2_bias = None
if use_bias:
w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
0.5 * hidden_size**0.5
)
router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
topk_weights, topk_ids = torch.topk(score, topk_num)
topk_ids = topk_ids.to(torch.int32)
packed_w13 = cpu_prepack_moe_weight(w13, isa)
packed_w2 = cpu_prepack_moe_weight(w2, isa)
def run_benchmark(iters: int) -> list[float]:
times = []
for _ in range(iters):
start_time = time.perf_counter_ns()
_ = cpu_fused_moe(
input_tensor,
packed_w13,
packed_w2,
w13_bias,
w2_bias,
topk_weights,
topk_ids,
activation,
isa,
)
end_time = time.perf_counter_ns()
times.append((end_time - start_time) / 1e6)
return times
# warmup
run_benchmark(5)
# benchmark
times = run_benchmark(iters)
if not times:
print("No iterations to measure. Set --iters > 0.")
return
time_min = min(times)
time_max = max(times)
time_mean = np.mean(times)
time_std = np.std(times)
print("\tmin (ms) = ", time_min)
print("\tmax (ms) = ", time_max)
print("\tmean (ms) = ", time_mean)
print("\tstd = ", time_std)
print("\tmedian (ms) = ", np.median(times))
# Calculate throughput metrics
# FLOPs estimation: 2 * batch * topk * (hidden * up_dim + intermediate * hidden)
flops_per_token = (
2 * topk_num * (hidden_size * up_dim + intermediate_size * hidden_size)
)
total_flops = batch_size * flops_per_token
tflops = total_flops / (time_mean * 1e-3) / 1e12
print(f"\tthroughput (TFLOP/s) = {tflops:.4f}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the CPU fused MoE kernel.")
parser.add_argument("--batch-size", type=int, default=64)
parser.add_argument("--expert-num", type=int, default=8)
parser.add_argument("--hidden-size", type=int, default=2880)
parser.add_argument("--intermediate-size", type=int, default=2880)
parser.add_argument(
"--topk-num",
type=int,
default=None,
help="Number of experts to route each token to (default: expert_num // 2)",
)
parser.add_argument("--use-bias", action="store_true")
parser.add_argument(
"--activation",
type=str,
choices=["silu", "swigluoai"],
default="silu",
help="Activation function",
)
parser.add_argument(
"--isa",
type=str,
choices=ISA_CHOICES,
default=ISA_CHOICES[0],
help=f"ISA to use (available: {ISA_CHOICES})",
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--iters", type=int, default=20)
args = parser.parse_args()
# Default topk_num to expert_num // 2, minimum 1
topk_num = (
args.topk_num if args.topk_num is not None else max(args.expert_num // 2, 1)
)
print(args)
main(
batch_size=args.batch_size,
expert_num=args.expert_num,
hidden_size=args.hidden_size,
intermediate_size=args.intermediate_size,
topk_num=topk_num,
use_bias=args.use_bias,
dtype=torch.bfloat16, # Following test_cpu_fused_moe.py
activation=args.activation,
isa=args.isa,
seed=args.seed,
iters=args.iters,
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment