sglang / Commits / b6b57fc2

Commit b6b57fc2 (unverified)
Authored Dec 31, 2024 by Yineng Zhang; committed by GitHub on Dec 31, 2024

minor: cleanup sgl-kernel (#2679)

Parent: b4403985
Showing 6 changed files with 0 additions and 145 deletions:

sgl-kernel/CMakeLists.txt                              +0  -1
sgl-kernel/setup.py                                    +0  -37
sgl-kernel/src/sgl-kernel/__init__.py                  +0  -2
sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu       +0  -10
sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu   +0  -90
sgl-kernel/src/sgl-kernel/ops/__init__.py              +0  -5
sgl-kernel/CMakeLists.txt

@@ -28,7 +28,6 @@ find_package(Torch REQUIRED)
 # Warp Reduce library
 add_library(_kernels SHARED
-    src/sgl-kernel/csrc/warp_reduce_kernel.cu
     src/sgl-kernel/csrc/trt_reduce_internal.cu
     src/sgl-kernel/csrc/trt_reduce_kernel.cu
     src/sgl-kernel/csrc/moe_align_kernel.cu
sgl-kernel/setup.py

-import os
-import shutil
-import zipfile
 from pathlib import Path
 from setuptools import setup

@@ -16,39 +13,6 @@ def get_version():
                 return line.split("=")[1].strip().strip('"')


-def rename_wheel():
-    if not os.environ.get("CUDA_VERSION"):
-        return
-    cuda_version = os.environ["CUDA_VERSION"].replace(".", "")
-    base_version = get_version()
-
-    wheel_dir = Path("dist")
-    old_wheel = next(wheel_dir.glob("*.whl"))
-    tmp_dir = wheel_dir / "tmp"
-    tmp_dir.mkdir(exist_ok=True)
-
-    with zipfile.ZipFile(old_wheel, "r") as zip_ref:
-        zip_ref.extractall(tmp_dir)
-
-    old_info = tmp_dir / f"sgl_kernel-{base_version}.dist-info"
-    new_info = tmp_dir / f"sgl_kernel-{base_version}.post0+cu{cuda_version}.dist-info"
-    old_info.rename(new_info)
-
-    platform = "manylinux2014_x86_64"
-    new_wheel = wheel_dir / old_wheel.name.replace("linux_x86_64", platform)
-    new_wheel = wheel_dir / new_wheel.name.replace(
-        base_version, f"{base_version}.post0+cu{cuda_version}"
-    )
-
-    with zipfile.ZipFile(new_wheel, "w", zipfile.ZIP_DEFLATED) as new_zip:
-        for file_path in tmp_dir.rglob("*"):
-            if file_path.is_file():
-                new_zip.write(file_path, file_path.relative_to(tmp_dir))
-
-    old_wheel.unlink()
-    shutil.rmtree(tmp_dir)
-
-
 def update_wheel_platform_tag():
     wheel_dir = Path("dist")
     old_wheel = next(wheel_dir.glob("*.whl"))

@@ -81,7 +45,6 @@ ext_modules = [
     CUDAExtension(
         name="sgl_kernel.ops._kernels",
         sources=[
-            "src/sgl-kernel/csrc/warp_reduce_kernel.cu",
             "src/sgl-kernel/csrc/trt_reduce_internal.cu",
             "src/sgl-kernel/csrc/trt_reduce_kernel.cu",
             "src/sgl-kernel/csrc/moe_align_kernel.cu",
sgl-kernel/src/sgl-kernel/__init__.py

@@ -3,12 +3,10 @@ from sgl_kernel.ops import (
     custom_reduce,
     init_custom_reduce,
     moe_align_block_size,
-    warp_reduce,
 )

 __all__ = [
     "moe_align_block_size",
-    "warp_reduce",
     "init_custom_reduce",
     "custom_dispose",
     "custom_reduce",
sgl-kernel/src/sgl-kernel/csrc/sgl_kernel_ops.cu

 #include "utils.hpp"

-// warp_reduce
-torch::Tensor warp_reduce_cuda(torch::Tensor input);
-
-torch::Tensor warp_reduce(torch::Tensor input) {
-  CHECK_CUDA_INPUT(input);
-  return warp_reduce_cuda(input);
-}
-
 // trt_reduce
 using fptr_t = int64_t;
 fptr_t init_custom_ar(int64_t rank_id, int64_t world_size, const std::vector<fptr_t>& buffers,

@@ -21,8 +13,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t b
                           torch::Tensor token_cnts_buffer, torch::Tensor cumsum_buffer);

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  // warp_reduce
-  m.def("reduce", &warp_reduce, "Warp Reduce (CUDA)");
   // trt_reduce
   m.def("init_custom_ar", &init_custom_ar, "init custom allreduce meta (CUDA)");
   m.def("dispose", &dispose, "dispose custom allreduce meta");
sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu (deleted, 100644 → 0)

#include <cuda.h>
#include <cuda_runtime.h>

#include "utils.hpp"

#define FINAL_MASK 0xffffffff
#define BLOCK_SIZE 256

template <typename scalar_t>
__device__ __forceinline__ scalar_t add(scalar_t a, scalar_t b) {
  return a + b;
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
#pragma unroll
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(FINAL_MASK, val, offset);
  }
  return val;
}

template <typename scalar_t>
__device__ __forceinline__ scalar_t blockReduceSum(scalar_t val) {
  __shared__ scalar_t shared[32];
  int lane = threadIdx.x % 32;
  int wid = threadIdx.x / 32;

  val = warpReduceSum(val);  // First reduce within warp

  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory

  __syncthreads();  // Wait for all partial reductions

  // Read from shared memory only if that warp existed
  val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane] : 0;

  if (wid == 0) val = warpReduceSum(val);  // Final reduce within first warp

  return val;
}

template <typename scalar_t>
__global__ void warp_reduce_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t, 1, torch::RestrictPtrTraits> input,
    torch::PackedTensorAccessor32<scalar_t, 1, torch::RestrictPtrTraits> output, int N) {
  scalar_t sum = 0;

  // Grid-stride loop
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += input[i];
  }

  // Perform block-wide reduction
  sum = blockReduceSum(sum);

  // Write result for this block to global memory
  if (threadIdx.x == 0) {
    output[blockIdx.x] = sum;
  }
}

torch::Tensor warp_reduce_cuda(torch::Tensor input) {
  // Input validation
  TORCH_CHECK(input.dim() == 1, "1D tensor expected");
  TORCH_CHECK(input.is_cuda(), "CUDA tensor expected");

  const auto N = input.size(0);

  // Handle empty tensor
  if (N == 0) {
    return torch::zeros({1}, input.options());
  }

  // Calculate grid dimensions
  const int threads = BLOCK_SIZE;
  const int blocks = (N + threads - 1) / threads;

  // Allocate output tensor for partial sums
  auto output = torch::empty({blocks}, input.options());

  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "warp_reduce_cuda", ([&] {
    warp_reduce_cuda_kernel<scalar_t><<<blocks, threads>>>(
        input.packed_accessor32<scalar_t, 1, torch::RestrictPtrTraits>(),
        output.packed_accessor32<scalar_t, 1, torch::RestrictPtrTraits>(), N);
  }));

  // Sum the partial results
  return output.sum();
}
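For orientation, the deleted kernel simply summed a 1-D CUDA tensor: each block accumulated a grid-stride partial sum, reduced it with warp shuffles and shared memory, and wrote one partial result per block; the host wrapper then called output.sum() over the partials. A rough Python equivalent of that two-stage reduction, for illustration only (the helper name below is not part of sgl-kernel):

import torch

def blockwise_sum_reference(x: torch.Tensor, block_size: int = 256) -> torch.Tensor:
    """Mimic the deleted kernel's structure: per-block partial sums, then a final sum."""
    if x.numel() == 0:
        # The deleted kernel returned a single zero for empty input.
        return torch.zeros(1, dtype=x.dtype, device=x.device)
    blocks = (x.numel() + block_size - 1) // block_size
    padded = torch.nn.functional.pad(x, (0, blocks * block_size - x.numel()))
    partial = padded.view(blocks, block_size).sum(dim=1)  # one partial sum per "block"
    return partial.sum()                                   # final reduction over blocks

x = torch.arange(1000.0)
assert torch.allclose(blockwise_sum_reference(x), x.sum())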
sgl-kernel/src/sgl-kernel/ops/__init__.py

@@ -2,11 +2,6 @@ from sgl_kernel.ops._kernels import all_reduce as _all_reduce
 from sgl_kernel.ops._kernels import dispose as _dispose
 from sgl_kernel.ops._kernels import init_custom_ar as _init_custom_ar
 from sgl_kernel.ops._kernels import moe_align_block_size as _moe_align_block_size
-from sgl_kernel.ops._kernels import reduce as _reduce
-
-
-def warp_reduce(input_tensor):
-    return _reduce(input_tensor)


 def init_custom_reduce(rank_id, num_devices, buffers, barrier_in, barrier_out):
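Before this commit, the wrapper removed above was re-exported at the package root, so it could be called as sketched below (usage sketch only, assuming a CUDA build of an sgl-kernel version predating b6b57fc2). Since the op just summed its input, a plain Tensor.sum computes the same value after the cleanup.

import torch
from sgl_kernel import warp_reduce  # removed by this commit

x = torch.randn(1 << 20, device="cuda")
y = warp_reduce(x)  # per-block partial sums, then a final sum
assert torch.allclose(y, x.sum(), rtol=1e-4, atol=1e-4)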