Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
100 additions
and
21 deletions
+100
-21
benchmarks/kernels/benchmark_moe.py
benchmarks/kernels/benchmark_moe.py
+5
-10
benchmarks/kernels/benchmark_moe_permute_unpermute.py
benchmarks/kernels/benchmark_moe_permute_unpermute.py
+1
-0
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_paged_attention.py
+1
-0
benchmarks/kernels/benchmark_quant.py
benchmarks/kernels/benchmark_quant.py
+1
-0
benchmarks/kernels/benchmark_rmsnorm.py
benchmarks/kernels/benchmark_rmsnorm.py
+1
-0
benchmarks/kernels/benchmark_rope.py
benchmarks/kernels/benchmark_rope.py
+2
-1
benchmarks/kernels/benchmark_shapes.py
benchmarks/kernels/benchmark_shapes.py
+1
-0
benchmarks/kernels/benchmark_w8a8_block_fp8.py
benchmarks/kernels/benchmark_w8a8_block_fp8.py
+1
-0
benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
...hmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+1
-0
benchmarks/kernels/graph_machete_bench.py
benchmarks/kernels/graph_machete_bench.py
+1
-0
benchmarks/kernels/utils.py
benchmarks/kernels/utils.py
+1
-0
benchmarks/kernels/weight_shapes.py
benchmarks/kernels/weight_shapes.py
+47
-0
benchmarks/overheads/benchmark_hashing.py
benchmarks/overheads/benchmark_hashing.py
+1
-0
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+13
-6
cmake/external_projects/vllm_flash_attn.cmake
cmake/external_projects/vllm_flash_attn.cmake
+18
-2
cmake/hipify.py
cmake/hipify.py
+1
-0
cmake/utils.cmake
cmake/utils.cmake
+1
-1
csrc/attention/mla/cutlass_mla_kernels.cu
csrc/attention/mla/cutlass_mla_kernels.cu
+1
-1
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+1
-0
csrc/moe/marlin_moe_wna16/generate_kernels.py
csrc/moe/marlin_moe_wna16/generate_kernels.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
benchmarks/kernels/benchmark_moe.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
import
json
...
...
@@ -6,7 +7,6 @@ import time
from
contextlib
import
nullcontext
from
datetime
import
datetime
from
itertools
import
product
from
types
import
SimpleNamespace
from
typing
import
Any
,
TypedDict
import
ray
...
...
@@ -42,7 +42,7 @@ def benchmark_config(
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
num_iters
:
int
=
100
,
block_quant_shape
:
L
ist
[
int
]
=
None
,
block_quant_shape
:
l
ist
[
int
]
=
None
,
use_deep_gemm
:
bool
=
False
,
)
->
float
:
init_dtype
=
torch
.
float16
if
use_fp8_w8a8
else
dtype
...
...
@@ -399,7 +399,7 @@ class BenchmarkWorker:
dtype
:
torch
.
dtype
,
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
block_quant_shape
:
L
ist
[
int
]
=
None
,
block_quant_shape
:
l
ist
[
int
]
=
None
,
use_deep_gemm
:
bool
=
False
,
)
->
tuple
[
dict
[
str
,
int
],
float
]:
current_platform
.
seed_everything
(
self
.
seed
)
...
...
@@ -531,7 +531,7 @@ def save_configs(
dtype
:
torch
.
dtype
,
use_fp8_w8a8
:
bool
,
use_int8_w8a16
:
bool
,
block_quant_shape
:
L
ist
[
int
],
block_quant_shape
:
l
ist
[
int
],
)
->
None
:
dtype_str
=
get_config_dtype_str
(
dtype
,
use_int8_w8a16
=
use_int8_w8a16
,
use_fp8_w8a8
=
use_fp8_w8a8
...
...
@@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
config
=
get_config
(
model
=
args
.
model
,
trust_remote_code
=
args
.
trust_remote_code
)
if
args
.
model_prefix
:
config
=
getattr
(
config
,
args
.
model_prefix
)
config
=
SimpleNamespace
(
**
config
)
if
config
.
architectures
[
0
]
==
"DbrxForCausalLM"
:
E
=
config
.
ffn_config
.
moe_num_experts
...
...
@@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size
=
2
*
intermediate_size
//
args
.
tp_size
hidden_size
=
config
.
hidden_size
dtype
=
(
torch
.
float16
if
current_platform
.
is_rocm
()
else
getattr
(
torch
,
config
.
torch_dtype
)
)
dtype
=
torch
.
float16
if
current_platform
.
is_rocm
()
else
config
.
torch_dtype
use_fp8_w8a8
=
args
.
dtype
==
"fp8_w8a8"
use_int8_w8a16
=
args
.
dtype
==
"int8_w8a16"
block_quant_shape
=
get_weight_block_size_safety
(
config
)
...
...
benchmarks/kernels/benchmark_moe_permute_unpermute.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
argparse
from
typing
import
Any
,
TypedDict
...
...
benchmarks/kernels/benchmark_paged_attention.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
time
...
...
benchmarks/kernels/benchmark_quant.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
time
...
...
benchmarks/kernels/benchmark_rmsnorm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
itertools
from
typing
import
Optional
,
Union
...
...
benchmarks/kernels/benchmark_rope.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
itertools
import
accumulate
from
typing
import
Optional
...
...
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
seed
:
int
,
device
:
str
,
max_position
:
int
=
8192
,
base
:
in
t
=
10000
,
base
:
floa
t
=
10000
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
...
...
benchmarks/kernels/benchmark_shapes.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES
=
{
"ideal"
:
[[
4
*
256
*
32
,
256
*
32
]],
...
...
benchmarks/kernels/benchmark_w8a8_block_fp8.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import
argparse
...
...
benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501
import
time
...
...
benchmarks/kernels/graph_machete_bench.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
pickle
...
...
benchmarks/kernels/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
dataclasses
from
collections.abc
import
Iterable
...
...
benchmarks/kernels/weight_shapes.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
...
...
@@ -48,4 +49,50 @@ WEIGHT_SHAPES = {
([
16384
,
106496
],
1
),
([
53248
,
16384
],
0
),
],
"meta-llama/Llama-3.1-8B-Instruct"
:
[
([
4096
,
6144
],
1
),
([
4096
,
4096
],
0
),
([
4096
,
28672
],
1
),
([
14336
,
4096
],
0
),
],
"meta-llama/Llama-3.3-70B-Instruct"
:
[
([
8192
,
10240
],
1
),
([
8192
,
8192
],
0
),
([
8192
,
57344
],
1
),
([
28672
,
8192
],
0
),
],
"mistralai/Mistral-Large-Instruct-2407"
:
[
([
12288
,
14336
],
1
),
([
12288
,
12288
],
0
),
([
12288
,
57344
],
1
),
([
28672
,
12288
],
0
),
],
"Qwen/Qwen2.5-7B-Instruct"
:
[
([
3584
,
4608
],
1
),
([
3584
,
3584
],
0
),
([
3584
,
37888
],
1
),
([
18944
,
3584
],
0
),
],
"Qwen/Qwen2.5-32B-Instruct"
:
[
([
5120
,
7168
],
1
),
([
5120
,
5120
],
0
),
([
5120
,
55296
],
1
),
([
27648
,
5120
],
0
),
],
"Qwen/Qwen2.5-72B-Instruct"
:
[
([
8192
,
10240
],
1
),
([
8192
,
8192
],
0
),
([
8192
,
59136
],
1
),
([
29568
,
8192
],
0
),
],
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
:
[
([
2048
,
3072
],
1
),
([
2048
,
4096
],
1
),
([
2048
,
2048
],
0
),
([
2048
,
576
],
0
),
([
2048
,
21888
],
1
),
([
10944
,
2048
],
0
),
([
2048
,
2816
],
1
),
([
1408
,
2048
],
0
),
],
}
benchmarks/overheads/benchmark_hashing.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
cProfile
import
pstats
...
...
cmake/cpu_extension.cmake
View file @
cc7f22a8
...
...
@@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
else
()
find_isa
(
${
CPUINFO
}
"avx2"
AVX2_FOUND
)
find_isa
(
${
CPUINFO
}
"avx512f"
AVX512_FOUND
)
find_isa
(
${
CPUINFO
}
"Power11"
POWER11_FOUND
)
find_isa
(
${
CPUINFO
}
"POWER10"
POWER10_FOUND
)
find_isa
(
${
CPUINFO
}
"POWER9"
POWER9_FOUND
)
find_isa
(
${
CPUINFO
}
"asimd"
ASIMD_FOUND
)
# Check for ARM NEON support
...
...
@@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
list
(
APPEND CXX_COMPILE_FLAGS
"-mavx2"
)
message
(
WARNING
"vLLM CPU backend using AVX2 ISA"
)
elseif
(
POWER9_FOUND OR POWER10_FOUND
)
elseif
(
POWER9_FOUND OR POWER10_FOUND
OR POWER11_FOUND
)
message
(
STATUS
"PowerPC detected"
)
# Check for PowerPC VSX support
list
(
APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=native"
"-mtune=native"
)
if
(
POWER9_FOUND
)
list
(
APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power9"
"-mtune=power9"
)
elseif
(
POWER10_FOUND OR POWER11_FOUND
)
list
(
APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10"
)
endif
()
elseif
(
ASIMD_FOUND
)
message
(
STATUS
"ARMv8 or later architecture detected"
)
...
...
cmake/external_projects/vllm_flash_attn.cmake
View file @
cc7f22a8
...
...
@@ -46,22 +46,38 @@ else()
endif
()
# Ensure the vllm/vllm_flash_attn directory exists before installation
install
(
CODE
"file(MAKE_DIRECTORY
\"\$
{CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn
\"
)"
ALL_COMPONENTS
)
# Make sure vllm-flash-attn install rules are nested under vllm/
# This is here to support installing all components under the same prefix with cmake --install.
# setup.py installs every component separately but uses the same prefix for all.
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
# and these statements don't hurt when installing neither component.
install
(
CODE
"set(CMAKE_INSTALL_LOCAL_ONLY FALSE)"
ALL_COMPONENTS
)
install
(
CODE
"set(OLD_CMAKE_INSTALL_PREFIX
\"\$
{CMAKE_INSTALL_PREFIX}
\"
)"
ALL_COMPONENTS
)
install
(
CODE
"set(CMAKE_INSTALL_PREFIX
\"\$
{CMAKE_INSTALL_PREFIX}/vllm/
\"
)"
ALL_COMPONENTS
)
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable
(
vllm-flash-attn
)
message
(
STATUS
"vllm-flash-attn is available at
${
vllm-flash-attn_SOURCE_DIR
}
"
)
# Restore the install prefix
install
(
CODE
"set(CMAKE_INSTALL_PREFIX
\"\$
{OLD_CMAKE_INSTALL_PREFIX}
\"
)"
ALL_COMPONENTS
)
install
(
CODE
"set(CMAKE_INSTALL_LOCAL_ONLY TRUE)"
ALL_COMPONENTS
)
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install
(
DIRECTORY
${
vllm-flash-attn_SOURCE_DIR
}
/vllm_flash_attn/
DESTINATION vllm_flash_attn
DESTINATION
vllm/
vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN
"*.py"
)
install
(
DIRECTORY
${
vllm-flash-attn_SOURCE_DIR
}
/vllm_flash_attn/
DESTINATION vllm_flash_attn
DESTINATION
vllm/
vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN
"*.py"
)
cmake/hipify.py
View file @
cc7f22a8
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# A command line tool for running pytorch's hipify preprocessor on CUDA
...
...
cmake/utils.cmake
View file @
cc7f22a8
...
...
@@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
set
(
CSRC_BUILD_DIR
${
CMAKE_CURRENT_BINARY_DIR
}
/csrc
)
add_custom_target
(
hipify
${
NAME
}
COMMAND
${
CMAKE_SOURCE_DIR
}
/cmake/hipify.py -p
${
CMAKE_SOURCE_DIR
}
/csrc -o
${
CSRC_BUILD_DIR
}
${
SRCS
}
COMMAND
${
Python_EXECUTABLE
}
${
CMAKE_SOURCE_DIR
}
/cmake/hipify.py -p
${
CMAKE_SOURCE_DIR
}
/csrc -o
${
CSRC_BUILD_DIR
}
${
SRCS
}
DEPENDS
${
CMAKE_SOURCE_DIR
}
/cmake/hipify.py
${
SRCS
}
BYPRODUCTS
${
HIP_SRCS
}
COMMENT
"Running hipify on
${
NAME
}
extension source files."
)
...
...
csrc/attention/mla/cutlass_mla_kernels.cu
View file @
cc7f22a8
...
...
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
{
static_cast
<
ElementOut
*>
(
out
.
data_ptr
()),
stride_O
,
static_cast
<
ElementAcc
*>
(
nullptr
),
stride_LSE
},
hw_info
,
-
1
,
// split_kv
1
,
// split_kv
nullptr
,
// is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
...
...
csrc/cutlass_extensions/vllm_cutlass_library_extension.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
enum
from
typing
import
Union
...
...
csrc/moe/marlin_moe_wna16/generate_kernels.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
glob
import
itertools
import
os
...
...
Prev
1
2
3
4
5
6
7
8
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment