sglang · Commits · a53454c5

Commit a53454c5 (unverified), authored Jan 16, 2025 by Yineng Zhang, committed by GitHub on Jan 16, 2025.

fix: sgl-kernel link cuda (#2906)

Parent: 6cb3974e

Showing 5 changed files with 44 additions and 48 deletions (+44, -48):

sgl-kernel/build.sh (+2, -0)
sgl-kernel/pyproject.toml (+1, -1)
sgl-kernel/setup.py (+1, -1)
sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu (+38, -43)
sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh (+2, -3)
sgl-kernel/build.sh

@@ -11,6 +11,8 @@ docker run --rm \
     ${PYTHON_ROOT_PATH}/bin/pip install --no-cache-dir torch==2.4.0 --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION//.} && \
     export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX' && \
     export CUDA_VERSION=${CUDA_VERSION} && \
+    mkdir -p /usr/lib/x86_64-linux-gnu/ && \
+    ln -s /usr/local/cuda-${CUDA_VERSION}/targets/x86_64-linux/lib/stubs/libcuda.so /usr/lib/x86_64-linux-gnu/libcuda.so && \
     cd /sgl-kernel && \
     ${PYTHON_ROOT_PATH}/bin/python setup.py bdist_wheel
     "
sgl-kernel/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sgl-kernel"
-version = "0.0.2.post13"
+version = "0.0.2.post14"
 description = "Kernel Library for SGLang"
 readme = "README.md"
 requires-python = ">=3.8"
sgl-kernel/setup.py

@@ -41,7 +41,7 @@ nvcc_flags = [
 ]
 cxx_flags = ["-O3"]
 libraries = ["c10", "torch", "torch_python", "cuda"]
-extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib"]
+extra_link_args = ["-Wl,-rpath,$ORIGIN/../../torch/lib", "-L/usr/lib/x86_64-linux-gnu"]
 ext_modules = [
     CUDAExtension(
         name="sgl_kernel.ops._kernels",
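Taken together, the build.sh and setup.py changes let the extension link against the CUDA driver library inside the build container: setup.py lists "cuda" in libraries, and because the container has no GPU driver installed, build.sh symlinks the toolkit's stub libcuda.so into /usr/lib/x86_64-linux-gnu, which the new -L flag adds to the linker search path. The stub is only used at link time; at runtime the real libcuda.so.1 comes from the host's NVIDIA driver. A minimal import smoke test, written against the module name declared in setup.py (a sketch, not part of this commit):

# Sketch of a post-install smoke test; assumes the wheel built by build.sh is
# installed in the current environment.
import ctypes
import importlib

import torch  # loads libtorch/c10 so the extension's torch symbols resolve

# Confirm the driver library itself is resolvable on this machine (provided by
# the installed NVIDIA driver at runtime, not by the link-time stub).
ctypes.CDLL("libcuda.so.1")

# Importing the extension fails here if any of its linked libraries
# (torch via the $ORIGIN rpath, or libcuda) cannot be resolved.
kernels = importlib.import_module("sgl_kernel.ops._kernels")
print("loaded", kernels.__file__)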
sgl-kernel/src/sgl-kernel/csrc/sampling_scaling_penalties.cu

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <THC/THCAtomics.cuh>

#include "utils.hpp"
#include "vectorization.cuh"

template <typename scalar_t>
__global__ void sampling_scaling_penalties_kernel(const scalar_t* logits, const scalar_t* scaling_penalties,
                                                  scalar_t* output, const int32_t numel) {
  const int32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t stride = blockDim.x * gridDim.x;

  auto const* vectorized_logits = reinterpret_cast<vec4_t<scalar_t> const*>(logits);
  auto const* vectorized_penalties = reinterpret_cast<vec4_t<scalar_t> const*>(scaling_penalties);
  auto* vectorized_output = reinterpret_cast<vec4_t<scalar_t>*>(output);

  const int32_t num_vec_elems = numel >> 2;

#pragma unroll 4
  for (int32_t i = tid; i < num_vec_elems; i += stride) {
    vec4_t<scalar_t> logits_vec = vectorized_logits[i];
    vec4_t<scalar_t> penalties_vec = vectorized_penalties[i];
    vec4_t<scalar_t> out_vec;

    out_vec.x = logits_vec.x > 0 ? logits_vec.x / penalties_vec.x : logits_vec.x * penalties_vec.x;
    out_vec.y = logits_vec.y > 0 ? logits_vec.y / penalties_vec.y : logits_vec.y * penalties_vec.y;
    out_vec.z = logits_vec.z > 0 ? logits_vec.z / penalties_vec.z : logits_vec.z * penalties_vec.z;
    out_vec.w = logits_vec.w > 0 ? logits_vec.w / penalties_vec.w : logits_vec.w * penalties_vec.w;

    vectorized_output[i] = out_vec;
  }

  const int32_t start_idx = num_vec_elems * 4;
  for (int32_t i = start_idx + tid; i < numel; i += stride) {
    scalar_t logit = logits[i];
    scalar_t penalty = scaling_penalties[i];
    output[i] = logit > 0 ? logit / penalty : logit * penalty;
  }
}

torch::Tensor sampling_scaling_penalties(const torch::Tensor& logits, const torch::Tensor& scaling_penalties) {
  auto output = torch::empty_like(logits);
  const auto numel = logits.numel();
  const int threads = 512;

  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, logits.scalar_type(), "sampling_scaling_penalties_kernel", ([&] {
        const int blocks = (numel + threads * 4 - 1) / (threads * 4);
        sampling_scaling_penalties_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            logits.data_ptr<scalar_t>(), scaling_penalties.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), numel);
      }));

  return output;
}
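The kernel applies the penalty rule element-wise, dividing a positive logit by its penalty and multiplying a non-positive one, reading four elements at a time through vec4_t and finishing the remainder in a scalar tail loop. For checking the kernel's output, a plain PyTorch reference of the same rule (a sketch; the helper name is invented here, only the element-wise rule comes from the kernel above):

import torch

def sampling_scaling_penalties_ref(logits: torch.Tensor, penalties: torch.Tensor) -> torch.Tensor:
    # Same element-wise rule as the CUDA kernel above: positive logits are
    # divided by the penalty, non-positive logits are multiplied by it.
    return torch.where(logits > 0, logits / penalties, logits * penalties)

# Example: half-precision logits with penalties >= 1, one of the dtypes the
# AT_DISPATCH macro covers (float, double, half, bfloat16).
logits = torch.randn(8, 32000, dtype=torch.float16)
penalties = torch.rand_like(logits) + 1.0
ref = sampling_scaling_penalties_ref(logits, penalties)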
sgl-kernel/src/sgl-kernel/csrc/vectorization.cuh

@@ -6,8 +6,8 @@
// Include both AMD and NVIDIA fp8 types to avoid circular import
// TODO(luka/varun) use FP8_TYPE instead after refactoring
#include <c10/util/Float8_e4m3fn.h>
#include <c10/util/Float8_e4m3fnuz.h>

// Vectorization containers
template <typename scalar_t>
...

@@ -20,8 +20,7 @@ struct __align__(8) vec4_t {
template <typename quant_type_t>
struct __align__(4) q8x4_t {
  static_assert(std::is_same_v<quant_type_t, int8_t> || std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
                std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
  quant_type_t x;
  quant_type_t y;
...
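The header keeps both fp8 includes so q8x4_t can be instantiated with either c10::Float8_e4m3fn (the OCP e4m3 format used on NVIDIA) or c10::Float8_e4m3fnuz (the variant used on AMD/ROCm). For orientation, these correspond to the fp8 dtypes recent PyTorch builds expose at the Python level (a sketch, unrelated to the commit itself; assumes a torch version that ships both dtypes):

import torch

# c10::Float8_e4m3fn   <-> torch.float8_e4m3fn
# c10::Float8_e4m3fnuz <-> torch.float8_e4m3fnuz
print(torch.float8_e4m3fn, torch.float8_e4m3fnuz)

# Casting to the OCP e4m3 dtype works on CPU in recent torch releases.
x = torch.randn(4).to(torch.float8_e4m3fn)
print(x.dtype)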