Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
ec3ee028
Unverified
Commit
ec3ee028
authored
Mar 28, 2025
by
Yineng Zhang
Committed by
GitHub
Mar 28, 2025
Browse files
fix sgl-kernel cu118 build (#4872)
parent
92941ce7
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
1 deletion
+12
-1
sgl-kernel/build.sh
sgl-kernel/build.sh
+2
-1
sgl-kernel/csrc/gemm/awq_kernel.cu
sgl-kernel/csrc/gemm/awq_kernel.cu
+5
-0
sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
+5
-0
No files found.
sgl-kernel/build.sh
View file @
ec3ee028
...
...
@@ -25,5 +25,6 @@ docker run --rm \
ln -s /usr/local/cuda-
${
CUDA_VERSION
}
/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so &&
\
cd /sgl-kernel &&
\
ls -la
${
PYTHON_ROOT_PATH
}
/lib/python
${
PYTHON_VERSION
}
/site-packages/wheel/ &&
\
PYTHONPATH=
${
PYTHON_ROOT_PATH
}
/lib/python
${
PYTHON_VERSION
}
/site-packages
${
PYTHON_ROOT_PATH
}
/bin/python -m uv build --wheel -Cbuild-dir=build . --color=always
PYTHONPATH=
${
PYTHON_ROOT_PATH
}
/lib/python
${
PYTHON_VERSION
}
/site-packages
${
PYTHON_ROOT_PATH
}
/bin/python -m uv build --wheel -Cbuild-dir=build . --color=always &&
\
./rename_wheels.sh
"
sgl-kernel/csrc/gemm/awq_kernel.cu
View file @
ec3ee028
// Adapted from
// https://github.com/vllm-project/vllm/blob/eb59b5a6cba6727d3727c0372258db9002f687c1/csrc/quantization/awq/gemm_kernels.cu#L350
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <torch/all.h>
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
...
...
@@ -79,6 +80,7 @@ __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) {
}
__device__
uint4
dequantize_s4_to_bf16x2
(
uint32_t
const
&
source
)
{
#if CUDA_VERSION >= 12000
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
uint4
result
;
uint32_t
*
h
=
reinterpret_cast
<
uint32_t
*>
(
&
result
);
...
...
@@ -118,6 +120,7 @@ __device__ uint4 dequantize_s4_to_bf16x2(uint32_t const& source) {
assert
(
false
);
return
{};
#endif
#endif
}
template
<
typename
OutputT
>
...
...
@@ -128,6 +131,7 @@ __global__ void __launch_bounds__(256) dequantize_weights(
OutputT
*
__restrict__
output
,
int
group_size
,
int
qweight_cols
)
{
#if CUDA_VERSION >= 12000
int
col
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
row
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
...
...
@@ -174,6 +178,7 @@ __global__ void __launch_bounds__(256) dequantize_weights(
static_assert
(
sizeof
(
uint4
)
==
8
*
sizeof
(
OutputT
),
"Memory layout mismatch"
);
*
reinterpret_cast
<
uint4
*>
(
output_ptr
)
=
weight_raw
;
}
#endif
}
torch
::
Tensor
awq_dequantize
(
torch
::
Tensor
qweight
,
torch
::
Tensor
scales
,
torch
::
Tensor
qzeros
)
{
...
...
sgl-kernel/csrc/gemm/nvfp4_quant_kernels.cu
View file @
ec3ee028
...
...
@@ -15,6 +15,7 @@ limitations under the License.
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
...
...
@@ -56,6 +57,7 @@ constexpr int CVT_FP4_SF_VEC_SIZE = 16;
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
inline
__device__
uint32_t
fp32_vec_to_e2m1
(
float
(
&
array
)[
8
])
{
// PTX instructions used here require sm100a.
#if CUDA_VERSION >= 12080
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
uint32_t
val
;
asm
volatile
(
...
...
@@ -83,11 +85,13 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
#else
return
0
;
#endif
#endif
}
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
inline
__device__
uint32_t
fp32_vec_to_e2m1
(
float2
(
&
array
)[
4
])
{
// PTX instructions used here require sm100a.
#if CUDA_VERSION >= 12080
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && __CUDA_ARCH_HAS_FEATURE__(SM100_ALL)
uint32_t
val
;
asm
volatile
(
...
...
@@ -115,6 +119,7 @@ inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
#else
return
0
;
#endif
#endif
}
// Fast reciprocal.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment