Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
260 additions
and
53 deletions
+260
-53
benchmarks/benchmark_serving_structured_output.py
benchmarks/benchmark_serving_structured_output.py
+1
-1
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+1
-0
benchmarks/benchmark_utils.py
benchmarks/benchmark_utils.py
+7
-1
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+1
-0
benchmarks/cutlass_benchmarks/utils.py
benchmarks/cutlass_benchmarks/utils.py
+1
-0
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+1
-0
benchmarks/cutlass_benchmarks/weight_shapes.py
benchmarks/cutlass_benchmarks/weight_shapes.py
+1
-0
benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+1
-0
benchmarks/disagg_benchmarks/round_robin_proxy.py
benchmarks/disagg_benchmarks/round_robin_proxy.py
+1
-0
benchmarks/disagg_benchmarks/visualize_benchmark_results.py
benchmarks/disagg_benchmarks/visualize_benchmark_results.py
+1
-0
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+1
-0
benchmarks/kernels/bench_fp8_gemm.py
benchmarks/kernels/bench_fp8_gemm.py
+223
-0
benchmarks/kernels/benchmark_aqlm.py
benchmarks/kernels/benchmark_aqlm.py
+1
-0
benchmarks/kernels/benchmark_bitblas.py
benchmarks/kernels/benchmark_bitblas.py
+1
-0
benchmarks/kernels/benchmark_cutlass_fp4_moe.py
benchmarks/kernels/benchmark_cutlass_fp4_moe.py
+2
-1
benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+12
-50
benchmarks/kernels/benchmark_layernorm.py
benchmarks/kernels/benchmark_layernorm.py
+1
-0
benchmarks/kernels/benchmark_lora.py
benchmarks/kernels/benchmark_lora.py
+1
-0
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_machete.py
+1
-0
benchmarks/kernels/benchmark_marlin.py
benchmarks/kernels/benchmark_marlin.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
benchmarks/benchmark_serving_structured_output.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r
"""Benchmark online serving throughput with structured outputs.
r
"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
On the server side, run one of the following commands:
...
@@ -11,7 +12,6 @@ On the client side, run:
...
@@ -11,7 +12,6 @@ On the client side, run:
--model <your_model> \
--model <your_model> \
--dataset json \
--dataset json \
--structured-output-ratio 1.0 \
--structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \
--request-rate 10 \
--num-prompts 1000
--num-prompts 1000
...
...
benchmarks/benchmark_throughput.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
"""Benchmark offline inference throughput."""
import
argparse
import
argparse
...
...
benchmarks/benchmark_utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
argparse
import
json
import
json
...
@@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
...
@@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
def
write_to_json
(
filename
:
str
,
records
:
list
)
->
None
:
def
write_to_json
(
filename
:
str
,
records
:
list
)
->
None
:
with
open
(
filename
,
"w"
)
as
f
:
with
open
(
filename
,
"w"
)
as
f
:
json
.
dump
(
records
,
f
,
cls
=
InfEncoder
)
json
.
dump
(
records
,
f
,
cls
=
InfEncoder
,
default
=
lambda
o
:
f
"<
{
type
(
o
).
__name__
}
object is not JSON serializable>"
,
)
benchmarks/cutlass_benchmarks/sparse_benchmarks.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
argparse
import
copy
import
copy
...
...
benchmarks/cutlass_benchmarks/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
# Cutlass bench utils
from
collections.abc
import
Iterable
from
collections.abc
import
Iterable
...
...
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
argparse
import
copy
import
copy
...
...
benchmarks/cutlass_benchmarks/weight_shapes.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# ([K, N], TP_SPLIT_DIM)
...
...
benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
os
...
...
benchmarks/disagg_benchmarks/round_robin_proxy.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
asyncio
import
itertools
import
itertools
...
...
benchmarks/disagg_benchmarks/visualize_benchmark_results.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
json
...
...
benchmarks/fused_kernels/layernorm_rms_benchmarks.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pickle
as
pkl
import
pickle
as
pkl
import
time
import
time
...
...
benchmarks/kernels/bench_fp8_gemm.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
copy
import
itertools
import
torch
from
weight_shapes
import
WEIGHT_SHAPES
from
vllm._custom_ops
import
cutlass_scaled_mm
as
vllm_scaled_mm
from
vllm._custom_ops
import
scaled_fp8_quant
as
vllm_scaled_fp8_quant
from
vllm.triton_utils
import
triton
def _unit_scale(device):
    """Return a one-element float32 tensor holding 1.0 on ``device``."""
    return torch.tensor([1.0], device=device, dtype=torch.float32)


def _quantize_weight(b, provider, N, device):
    """Quantize the weight ``b`` to FP8 ahead of time for ``provider``.

    Returns ``(b_fp8, scale_b_fp8)`` exactly as produced by
    ``vllm_scaled_fp8_quant`` (the caller transposes ``b_fp8``).

    Raises:
        ValueError: if ``provider`` names no known weight/activation scheme.
    """
    if "tensor-w-token-a" in provider:
        # Per-tensor quant for B; no scale is supplied to the quant op.
        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
        assert scale_b_fp8.numel() == 1
    elif "tensor-w-tensor-a" in provider:
        # Static per-tensor quantization with a fixed scale of 1.0.
        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, _unit_scale(device))
        assert scale_b_fp8.numel() == 1
    elif "channel-w-token-a" in provider or "channel-w-tensor-a" in provider:
        # Static per-channel quantization for weights.
        # NOTE(review): torch.tensor((N,)) is a ONE-element tensor whose value
        # is N (not a tensor of N scales); it is then broadcast to N entries
        # below. Kept as in the original so the measured kernels see the same
        # inputs -- confirm whether a scale value of N is intended.
        scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
        b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
        scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
        assert scale_b_fp8.numel() == N
    else:
        raise ValueError(f"Unknown FP8 benchmark provider: {provider!r}")
    return b_fp8, scale_b_fp8


def _activation_quantizer(a, provider, device):
    """Build a zero-argument callable that quantizes the activation ``a``.

    The callable returns ``(a_fp8, scale_a_fp8)``. Any fixed scale tensor is
    created here, once, so that calling the quantizer allocates no new scale.

    Raises:
        ValueError: if ``provider`` names no known activation scheme.
    """
    if "-w-token-a" in provider:
        # Per-token quant for A (no scale supplied).
        def quantize_a():
            return vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)

    elif "-w-tensor-a" in provider:
        # Static per-tensor quant for A with a fixed scale of 1.0.
        scale_a = _unit_scale(device)

        def quantize_a():
            return vllm_scaled_fp8_quant(a, scale_a)

    else:
        raise ValueError(f"Unknown FP8 benchmark provider: {provider!r}")
    return quantize_a


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=[
            "torch-bf16",
            # "fp8-tensor-w-token-a",
            "fp8-tensor-w-tensor-a",
            "fp8-channel-w-token-a",
            # "fp8-channel-w-tensor-a",
            # "fp8-tensor-w-token-a-noquant",
            "fp8-tensor-w-tensor-a-noquant",
            "fp8-channel-w-token-a-noquant",
            # "fp8-channel-w-tensor-a-noquant",
        ],
        line_names=[
            "torch-bf16",
            # "fp8-tensor-w-token-a",
            "fp8-tensor-w-tensor-a",
            "fp8-channel-w-token-a",
            # "fp8-channel-w-tensor-a",
            # "fp8-tensor-w-token-a-noquant",
            "fp8-tensor-w-tensor-a-noquant",
            "fp8-channel-w-token-a-noquant",
            # "fp8-channel-w-tensor-a-noquant",
        ],
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs FP8 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K):
    """Time one (batch_size x K) @ (K x N) GEMM for ``provider``.

    Args:
        batch_size: number of rows M of the activation matrix.
        provider: benchmark line name; ``"torch-bf16"`` runs
            ``torch.nn.functional.linear`` on bf16 inputs, names containing
            ``"fp8"`` run ``vllm_scaled_mm`` on FP8-quantized inputs. A
            ``"noquant"`` suffix quantizes the activation before timing so
            only the GEMM is measured; otherwise the activation quant is
            part of the timed callable.
        N: number of weight rows (output features).
        K: shared inner dimension.

    Returns:
        Three TFLOP/s figures computed from the three timings reported by
        ``triton.testing.do_bench_cudagraph`` for ``quantiles``.

    Raises:
        ValueError: if ``provider`` is not a recognised benchmark line.
    """
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    # Create input tensors
    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)

    quantiles = [0.5, 0.2, 0.8]

    if "torch-bf16" in provider:

        def run():
            return torch.nn.functional.linear(a, b)

    elif "fp8" in provider:
        # Weights are always quantized ahead of time
        b_fp8, scale_b_fp8 = _quantize_weight(b, provider, N, device)
        quantize_a = _activation_quantizer(a, provider, device)
        b_fp8 = b_fp8.t()

        if "noquant" in provider:
            # For no quantization, we just measure the GEMM
            a_fp8, scale_a_fp8 = quantize_a()

            def run():
                return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

        else:
            # In these cases, we quantize the activations during the GEMM call
            def run():
                a_fp8, scale_a_fp8 = quantize_a()
                return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)

    else:
        # Previously this fell through to an UnboundLocalError on ``ms``.
        raise ValueError(f"Unknown benchmark provider: {provider!r}")

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(run, quantiles=quantiles)

    def tflops(elapsed_ms):
        # Calculate TFLOP/s, two flops per multiply-add
        return (2 * M * N * K) * 1e-12 / (elapsed_ms * 1e-3)

    # The longer time yields the lower throughput, hence max_ms before min_ms.
    return tflops(ms), tflops(max_ms), tflops(min_ms)
def prepare_shapes(args):
    """Expand every (model, tp_size) pair into ``[K, N, model]`` entries.

    For each model in ``args.models`` and each size in ``args.tp_sizes``, the
    model's entries in ``WEIGHT_SHAPES`` are copied, the dimension selected by
    the entry's split index is floor-divided by the tensor-parallel size, and
    the model name is appended to the shape.

    Returns:
        A flat list with one ``[*shape, model]`` list per weight shape.
    """
    shapes_with_names = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        assert model in WEIGHT_SHAPES
        # Deep-copy so the shared WEIGHT_SHAPES table is never mutated.
        for shape, split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            shape[split_dim] //= tp_size
            shapes_with_names.append([*shape, model])
    return shapes_with_names
if __name__ == "__main__":
    # Command-line driver: one benchmark sweep per (K, N) weight shape of
    # every requested model / tensor-parallel size combination.
    parser = argparse.ArgumentParser()

    models_option = dict(
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.1-8B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
        help="List of models to benchmark",
    )
    parser.add_argument("--models", **models_option)

    tp_sizes_option = dict(
        nargs="+",
        type=int,
        default=[1],
        help="List of tensor parallel sizes",
    )
    parser.add_argument("--tp-sizes", **tp_sizes_option)

    args = parser.parse_args()

    KN_model_names = prepare_shapes(args)
    for K, N, model_name in KN_model_names:
        header = f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:"
        print(header)
        # Results for each shape are saved under a shape-specific path.
        out_path = f"bench_fp8_res_n{N}_k{K}"
        benchmark.run(
            print_data=True,
            show_plots=True,
            save_path=out_path,
            N=N,
            K=K,
        )

    print("Benchmark finished!")
benchmarks/kernels/benchmark_aqlm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
os
import
sys
import
sys
...
...
benchmarks/kernels/benchmark_bitblas.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Licensed under the MIT License.
...
...
benchmarks/kernels/benchmark_cutlass_fp4_moe.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
"""
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
...
@@ -90,7 +91,7 @@ def bench_run(
...
@@ -90,7 +91,7 @@ def bench_run(
score
=
torch
.
randn
((
m
,
num_experts
),
device
=
device
,
dtype
=
dtype
)
score
=
torch
.
randn
((
m
,
num_experts
),
device
=
device
,
dtype
=
dtype
)
topk_weights
,
topk_ids
=
fused_topk
(
a
,
score
,
topk
,
renormalize
=
False
)
topk_weights
,
topk_ids
,
_
=
fused_topk
(
a
,
score
,
topk
,
renormalize
=
False
)
quant_blocksize
=
16
quant_blocksize
=
16
w1_blockscale
=
torch
.
empty
(
w1_blockscale
=
torch
.
empty
(
...
...
benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
torch
import
torch.utils.benchmark
as
benchmark
import
torch.utils.benchmark
as
benchmark
...
@@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
...
@@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.cutlass_moe
import
cutlass_moe_fp8
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
cutlass_moe_fp8
,
fused_experts
,
fused_experts
,
fused_topk
,
fused_topk
,
)
)
...
@@ -69,18 +70,9 @@ def bench_run(
...
@@ -69,18 +70,9 @@ def bench_run(
w1_scale
=
torch
.
empty
((
num_experts
,
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
w1_scale
=
torch
.
empty
((
num_experts
,
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
w2_scale
=
torch
.
empty
((
num_experts
,
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
w2_scale
=
torch
.
empty
((
num_experts
,
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
ab_strides1
=
torch
.
full
((
num_experts
,),
k
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
c_strides1
=
torch
.
full
((
num_experts
,),
2
*
n
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
ab_strides2
=
torch
.
full
((
num_experts
,),
n
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
c_strides2
=
torch
.
full
((
num_experts
,),
k
,
device
=
"cuda"
,
dtype
=
torch
.
int64
)
for
expert
in
range
(
num_experts
):
for
expert
in
range
(
num_experts
):
w1_q
[
expert
],
w1_scale
[
expert
]
=
ops
.
scaled_fp8_quant
(
w1
[
expert
])
w1_q
[
expert
],
w1_scale
[
expert
]
=
ops
.
scaled_fp8_quant
(
w1
[
expert
])
w2_q
[
expert
],
w2_scale
[
expert
]
=
ops
.
scaled_fp8_quant
(
w2
[
expert
])
w2_q
[
expert
],
w2_scale
[
expert
]
=
ops
.
scaled_fp8_quant
(
w2
[
expert
])
w1_q_notransp
=
w1_q
.
clone
()
w2_q_notransp
=
w2_q
.
clone
()
w1_q
=
w1_q
.
transpose
(
1
,
2
)
w2_q
=
w2_q
.
transpose
(
1
,
2
)
score
=
torch
.
randn
((
m
,
num_experts
),
device
=
"cuda"
,
dtype
=
dtype
)
score
=
torch
.
randn
((
m
,
num_experts
),
device
=
"cuda"
,
dtype
=
dtype
)
...
@@ -121,10 +113,6 @@ def bench_run(
...
@@ -121,10 +113,6 @@ def bench_run(
w2_scale
:
torch
.
Tensor
,
w2_scale
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
ab_strides1
:
torch
.
Tensor
,
c_strides1
:
torch
.
Tensor
,
ab_strides2
:
torch
.
Tensor
,
c_strides2
:
torch
.
Tensor
,
num_repeats
:
int
,
num_repeats
:
int
,
):
):
for
_
in
range
(
num_repeats
):
for
_
in
range
(
num_repeats
):
...
@@ -132,14 +120,10 @@ def bench_run(
...
@@ -132,14 +120,10 @@ def bench_run(
a
,
a
,
w1
,
w1
,
w2
,
w2
,
w1_scale
,
w2_scale
,
topk_weights
,
topk_weights
,
topk_ids
,
topk_ids
,
ab_strides1
,
w1_scale
,
c_strides1
,
w2_scale
,
ab_strides2
,
c_strides2
,
a1_scale
=
a_scale
,
a1_scale
=
a_scale
,
)
)
...
@@ -152,10 +136,6 @@ def bench_run(
...
@@ -152,10 +136,6 @@ def bench_run(
w2_scale
:
torch
.
Tensor
,
w2_scale
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
ab_strides1
:
torch
.
Tensor
,
c_strides1
:
torch
.
Tensor
,
ab_strides2
:
torch
.
Tensor
,
c_strides2
:
torch
.
Tensor
,
):
):
with
set_current_vllm_config
(
with
set_current_vllm_config
(
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
VllmConfig
(
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
))
...
@@ -164,14 +144,10 @@ def bench_run(
...
@@ -164,14 +144,10 @@ def bench_run(
a
,
a
,
w1_q
,
w1_q
,
w2_q
,
w2_q
,
w1_scale
,
w2_scale
,
topk_weights
,
topk_weights
,
topk_ids
,
topk_ids
,
ab_strides1
,
w1_scale
,
c_strides1
,
w2_scale
,
ab_strides2
,
c_strides2
,
a1_scale
=
a_scale
,
a1_scale
=
a_scale
,
)
)
...
@@ -217,10 +193,6 @@ def bench_run(
...
@@ -217,10 +193,6 @@ def bench_run(
w2_scale
,
w2_scale
,
topk_weights
,
topk_weights
,
topk_ids
,
topk_ids
,
ab_strides1
,
c_strides1
,
ab_strides2
,
c_strides2
,
)
)
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
...
@@ -229,8 +201,8 @@ def bench_run(
...
@@ -229,8 +201,8 @@ def bench_run(
with
torch
.
cuda
.
graph
(
triton_graph
,
stream
=
triton_stream
):
with
torch
.
cuda
.
graph
(
triton_graph
,
stream
=
triton_stream
):
run_triton_from_graph
(
run_triton_from_graph
(
a
,
a
,
w1_q
_notransp
,
w1_q
,
w2_q
_notransp
,
w2_q
,
topk_weights
,
topk_weights
,
topk_ids
,
topk_ids
,
w1_scale
,
w1_scale
,
...
@@ -249,18 +221,12 @@ def bench_run(
...
@@ -249,18 +221,12 @@ def bench_run(
"w2"
:
w2
,
"w2"
:
w2
,
"score"
:
score
,
"score"
:
score
,
"topk"
:
topk
,
"topk"
:
topk
,
"w1_q_notransp"
:
w1_q_notransp
,
"w2_q_notransp"
:
w2_q_notransp
,
# Cutlass params
# Cutlass params
"a_scale"
:
a_scale
,
"a_scale"
:
a_scale
,
"w1_q"
:
w1_q
,
"w1_q"
:
w1_q
,
"w2_q"
:
w2_q
,
"w2_q"
:
w2_q
,
"w1_scale"
:
w1_scale
,
"w1_scale"
:
w1_scale
,
"w2_scale"
:
w2_scale
,
"w2_scale"
:
w2_scale
,
"ab_strides1"
:
ab_strides1
,
"c_strides1"
:
c_strides1
,
"ab_strides2"
:
ab_strides2
,
"c_strides2"
:
c_strides2
,
# cuda graph params
# cuda graph params
"cutlass_graph"
:
cutlass_graph
,
"cutlass_graph"
:
cutlass_graph
,
"triton_graph"
:
triton_graph
,
"triton_graph"
:
triton_graph
,
...
@@ -278,8 +244,8 @@ def bench_run(
...
@@ -278,8 +244,8 @@ def bench_run(
# Warmup
# Warmup
run_triton_moe
(
run_triton_moe
(
a
,
a
,
w1_q
_notransp
,
w1_q
,
w2_q
_notransp
,
w2_q
,
topk_weights
,
topk_weights
,
topk_ids
,
topk_ids
,
w1_scale
,
w1_scale
,
...
@@ -290,7 +256,7 @@ def bench_run(
...
@@ -290,7 +256,7 @@ def bench_run(
results
.
append
(
results
.
append
(
benchmark
.
Timer
(
benchmark
.
Timer
(
stmt
=
"run_triton_moe(a, w1_q
_notransp, w2_q_notransp
, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)"
,
# noqa: E501
stmt
=
"run_triton_moe(a, w1_q
, w2_q
, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)"
,
# noqa: E501
globals
=
globals
,
globals
=
globals
,
label
=
label
,
label
=
label
,
sub_label
=
sub_label
,
sub_label
=
sub_label
,
...
@@ -321,16 +287,12 @@ def bench_run(
...
@@ -321,16 +287,12 @@ def bench_run(
w2_scale
,
w2_scale
,
topk_weights
,
topk_weights
,
topk_ids
,
topk_ids
,
ab_strides1
,
c_strides1
,
ab_strides2
,
c_strides2
,
num_warmup
,
num_warmup
,
)
)
results
.
append
(
results
.
append
(
benchmark
.
Timer
(
benchmark
.
Timer
(
stmt
=
"run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids,
ab_strides1, c_strides1, ab_strides2, c_strides2,
num_runs)"
,
# noqa: E501
stmt
=
"run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)"
,
# noqa: E501
globals
=
globals
,
globals
=
globals
,
label
=
label
,
label
=
label
,
sub_label
=
sub_label
,
sub_label
=
sub_label
,
...
...
benchmarks/kernels/benchmark_layernorm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
import
time
...
...
benchmarks/kernels/benchmark_lora.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
argparse
import
copy
import
copy
...
...
benchmarks/kernels/benchmark_machete.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
argparse
import
copy
import
copy
...
...
benchmarks/kernels/benchmark_marlin.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
torch
import
torch.utils.benchmark
as
benchmark
import
torch.utils.benchmark
as
benchmark
...
...
Prev
1
2
3
4
5
6
7
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment