Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
49e8c7ea
Unverified
Commit
49e8c7ea
authored
Jul 10, 2025
by
Michael Goin
Committed by
GitHub
Jul 09, 2025
Browse files
Use NVCC `--compress-mode` to reduce binary size by 30% (#20694)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent
805d62ca
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing 1 changed file with 19 additions and 12 deletions
+19
-12
CMakeLists.txt
CMakeLists.txt
+19
-12
No files found.
CMakeLists.txt
View file @
49e8c7ea
...
@@ -171,6 +171,13 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -171,6 +171,13 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND VLLM_GPU_FLAGS
"--threads=
${
NVCC_THREADS
}
"
)
list
(
APPEND VLLM_GPU_FLAGS
"--threads=
${
NVCC_THREADS
}
"
)
endif
()
endif
()
#
# Set nvcc fatbin compression.
#
# Appends the fatbin compression flags to VLLM_GPU_FLAGS when the GPU language
# is CUDA and the CUDA compiler version is 12.8 or newer.
#
# The compiler version is referenced by variable name (and guarded with
# DEFINED) instead of an unquoted ${...} expansion. This condition is evaluated
# for every GPU language, and when CMAKE_CUDA_COMPILER_VERSION is unset an
# unquoted expansion vanishes, leaving if() with a dangling
# VERSION_GREATER_EQUAL operator that aborts configuration.
#
if(VLLM_GPU_LANG STREQUAL "CUDA" AND
   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
endif()
#
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
...
@@ -392,7 +399,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -392,7 +399,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later
# CUDA 12.0 or later
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"9.0a;"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"9.0a;"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.0 AND SCALED_MM_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.0 AND SCALED_MM_ARCHS
)
set
(
SRCS
set
(
SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
...
@@ -408,7 +415,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -408,7 +415,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND SCALED_MM_3X_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
list
(
APPEND SCALED_MM_3X_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building scaled_mm_c3x_sm90 for archs:
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building scaled_mm_c3x_sm90 for archs:
${
SCALED_MM_ARCHS
}
"
)
else
()
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.0 AND SCALED_MM_ARCHS
)
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.0 AND SCALED_MM_ARCHS
)
message
(
STATUS
"Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
message
(
STATUS
"Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"later if you intend on running FP8 quantized models on "
...
@@ -423,7 +430,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -423,7 +430,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
# CUDA 12.8 or later
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"12.0;12.0a"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"12.0;12.0a"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND SCALED_MM_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.8 AND SCALED_MM_ARCHS
)
set
(
SRCS
set
(
SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
...
@@ -437,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -437,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND SCALED_MM_3X_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
list
(
APPEND SCALED_MM_3X_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building scaled_mm_c3x_sm120 for archs:
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building scaled_mm_c3x_sm120 for archs:
${
SCALED_MM_ARCHS
}
"
)
else
()
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND SCALED_MM_ARCHS
)
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.8 AND SCALED_MM_ARCHS
)
message
(
STATUS
"Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
message
(
STATUS
"Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"later if you intend on running FP8 quantized models on "
"later if you intend on running FP8 quantized models on "
...
@@ -452,7 +459,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -452,7 +459,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
# require CUDA 12.8 or later
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"10.0a;10.1a"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"10.0a;10.1a"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND SCALED_MM_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.8 AND SCALED_MM_ARCHS
)
set
(
SRCS
set
(
SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
...
@@ -467,7 +474,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -467,7 +474,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND SCALED_MM_3X_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
list
(
APPEND SCALED_MM_3X_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building scaled_mm_c3x_sm100 for archs:
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building scaled_mm_c3x_sm100 for archs:
${
SCALED_MM_ARCHS
}
"
)
else
()
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND SCALED_MM_ARCHS
)
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.8 AND SCALED_MM_ARCHS
)
message
(
STATUS
"Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
message
(
STATUS
"Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"later if you intend on running FP8 quantized models on "
"later if you intend on running FP8 quantized models on "
...
@@ -510,7 +517,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -510,7 +517,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper).
# require CUDA 12.2 or later (and only work on Hopper).
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"9.0a;"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"9.0a;"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.2 AND SCALED_MM_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.2 AND SCALED_MM_ARCHS
)
set
(
SRCS
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu"
)
set
(
SRCS
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu"
)
set_gencode_flags_for_srcs
(
set_gencode_flags_for_srcs
(
SRCS
"
${
SRCS
}
"
SRCS
"
${
SRCS
}
"
...
@@ -519,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -519,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list
(
APPEND VLLM_GPU_FLAGS
"-DENABLE_SPARSE_SCALED_MM_C3X=1"
)
list
(
APPEND VLLM_GPU_FLAGS
"-DENABLE_SPARSE_SCALED_MM_C3X=1"
)
message
(
STATUS
"Building sparse_scaled_mm_c3x for archs:
${
SCALED_MM_ARCHS
}
"
)
message
(
STATUS
"Building sparse_scaled_mm_c3x for archs:
${
SCALED_MM_ARCHS
}
"
)
else
()
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.2 AND SCALED_MM_ARCHS
)
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.2 AND SCALED_MM_ARCHS
)
message
(
STATUS
"Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
message
(
STATUS
"Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper."
)
"if you intend on running FP8 sparse quantized models on Hopper."
)
...
@@ -531,7 +538,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -531,7 +538,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# FP4 Archs and flags
# FP4 Archs and flags
cuda_archs_loose_intersection
(
FP4_ARCHS
"10.0a"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
FP4_ARCHS
"10.0a"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND FP4_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.8 AND FP4_ARCHS
)
set
(
SRCS
set
(
SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
...
@@ -552,7 +559,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -552,7 +559,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MLA Archs and flags
# CUTLASS MLA Archs and flags
cuda_archs_loose_intersection
(
MLA_ARCHS
"10.0a"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
MLA_ARCHS
"10.0a"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.8 AND MLA_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.8 AND MLA_ARCHS
)
set
(
SRCS
set
(
SRCS
"csrc/attention/mla/cutlass_mla_kernels.cu"
)
"csrc/attention/mla/cutlass_mla_kernels.cu"
)
set_gencode_flags_for_srcs
(
set_gencode_flags_for_srcs
(
...
@@ -641,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -641,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The machete kernels only work on hopper and require CUDA 12.0 or later.
# The machete kernels only work on hopper and require CUDA 12.0 or later.
# Only build Machete kernels if we are building for something compatible with sm90a
# Only build Machete kernels if we are building for something compatible with sm90a
cuda_archs_loose_intersection
(
MACHETE_ARCHS
"9.0a"
"
${
CUDA_ARCHS
}
"
)
cuda_archs_loose_intersection
(
MACHETE_ARCHS
"9.0a"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.0 AND MACHETE_ARCHS
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.0 AND MACHETE_ARCHS
)
#
#
# For the Machete kernels we automatically generate sources for various
# For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# preselected input type pairs and schedules.
...
@@ -693,7 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
...
@@ -693,7 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message
(
STATUS
"Building Machete kernels for archs:
${
MACHETE_ARCHS
}
"
)
message
(
STATUS
"Building Machete kernels for archs:
${
MACHETE_ARCHS
}
"
)
else
()
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER 12.0
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER
_EQUAL
12.0
AND MACHETE_ARCHS
)
AND MACHETE_ARCHS
)
message
(
STATUS
"Not building Machete kernels as CUDA Compiler version is "
message
(
STATUS
"Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment