sglang / Commits / fccbfa37

Commit fccbfa37 (unverified), authored Dec 14, 2024 by Yineng Zhang, committed via GitHub on Dec 14, 2024
format: add clang-format for sgl-kernel (#2483)
Parent: 2f9bd0fa

Showing 4 changed files with 28 additions and 27 deletions (+28, -27):
  sgl-kernel/.clang-format                                +8   -0
  sgl-kernel/Makefile                                      +4   -1
  sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc            +4   -6
  sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu     +12  -20
sgl-kernel/.clang-format (new file, mode 100644)
BasedOnStyle: Google
IndentWidth: 2
ColumnLimit: 120
AllowShortFunctionsOnASingleLine: Empty
DerivePointerAlignment: false
PointerAlignment: Left
NamespaceIndentation: None
SortIncludes: true
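
For illustration only (not part of the commit), here is how a small C++ snippet would look under this style; the names are hypothetical and the comments call out the settings they exercise:

// Illustrative only: hypothetical code formatted under the .clang-format above.
#include <algorithm>  // SortIncludes: true keeps include blocks alphabetized
#include <vector>

namespace demo {  // NamespaceIndentation: None, so the contents below are not indented

int* FirstNegative(std::vector<int>& v) {  // PointerAlignment: Left gives "int* p", not "int *p"
  for (int& x : v) {                       // IndentWidth: 2
    if (x < 0) return &x;
  }
  return nullptr;
}

}  // namespace demo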
sgl-kernel/Makefile

-.PHONY: tree ln install build clean test
+.PHONY: tree ln install build clean test format

 tree:
 	@tree --prune -I "__pycache__|*.egg-info|*.so|build"

...

@@ -17,3 +17,6 @@ clean:
 test:
 	@pytest tests/
+
+format:
+	@find src tests -name '*.cc' -o -name '*.cu' -o -name '*.cuh' -o -name '*.h' | xargs clang-format -i && find src tests -name '*.py' | xargs isort && find src tests -name '*.py' | xargs black
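
Usage note (not part of the diff): running make format from the sgl-kernel directory formats the .cc, .cu, .cuh, and .h files under src and tests in place with clang-format, which picks up the .clang-format file added above, and then runs isort and black over the Python files in the same directories.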
sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc

...
@@ -2,12 +2,10 @@
 torch::Tensor warp_reduce_cuda(torch::Tensor input);

-#define CHECK_CUDA(x) \
-  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
-  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x) \
-  CHECK_CUDA(x); \
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x); \
   CHECK_CONTIGUOUS(x)

 torch::Tensor warp_reduce(torch::Tensor input) {
...
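
For context, a sketch of how these macros are typically used in such a wrapper; the function body below is an assumption for illustration, not the elided contents of the file:

torch::Tensor warp_reduce(torch::Tensor input) {
  CHECK_INPUT(input);              // hypothetical: reject non-CUDA or non-contiguous tensors up front
  return warp_reduce_cuda(input);  // hypothetical: forward to the CUDA implementation declared above
}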
sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu

...
@@ -25,34 +25,28 @@ __device__ __forceinline__ scalar_t blockReduceSum(scalar_t val) {
  int lane = threadIdx.x % 32;
  int wid = threadIdx.x / 32;

  val = warpReduceSum(val);  // First reduce within warp

  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory

  __syncthreads();  // Wait for all partial reductions

  // Read from shared memory only if that warp existed
  val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane] : 0;

  if (wid == 0) val = warpReduceSum(val);  // Final reduce within first warp

  return val;
}
template <typename scalar_t>
__global__ void warp_reduce_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t, 1, torch::RestrictPtrTraits> input,
    torch::PackedTensorAccessor32<scalar_t, 1, torch::RestrictPtrTraits> output, int N) {
  scalar_t sum = 0;

  // Grid-stride loop
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += input[i];
  }
...
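
blockReduceSum relies on a per-warp primitive, warpReduceSum, that is defined elsewhere in this file and does not appear in the diff. A minimal sketch of the conventional shuffle-based warp reduction, shown as an assumption about the pattern rather than a quote of the actual code:

template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
  // Each step adds the value held by the lane "offset" positions away; after
  // offsets 16, 8, 4, 2, 1 the full 32-lane sum ends up in lane 0.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}

The grid-stride loop in warp_reduce_cuda_kernel then lets a fixed launch cover any N: each thread accumulates a private sum over elements spaced blockDim.x * gridDim.x apart, and the elided remainder of the kernel presumably combines these with blockReduceSum and writes one partial sum per block, matching the per-block output tensor allocated in the next hunk.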
@@ -84,13 +78,11 @@ torch::Tensor warp_reduce_cuda(torch::Tensor input) {
  // Allocate output tensor for partial sums
  auto output = torch::empty({blocks}, input.options());

  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "warp_reduce_cuda", ([&] {
                               warp_reduce_cuda_kernel<scalar_t><<<blocks, threads>>>(
                                   input.packed_accessor32<scalar_t, 1, torch::RestrictPtrTraits>(),
                                   output.packed_accessor32<scalar_t, 1, torch::RestrictPtrTraits>(), N);
                             }));

  // Sum the partial results
  return output.sum();
...
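
This is a two-stage reduction: the custom kernel reduces each block to a single partial sum, and output.sum() lets PyTorch finish the reduction on the device, avoiding atomics or a second hand-written kernel. A hypothetical host-side call through the warp_reduce wrapper, for illustration only:

// Hypothetical usage; the tensor shape and variable names are illustrative, not from the repository.
auto x = torch::randn({1 << 20}, torch::TensorOptions().device(torch::kCUDA));
auto total = warp_reduce(x);  // scalar tensor; should match x.sum() up to floating-point rounding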