Commit fccbfa37 (unverified) in sglang

format: add clang-format for sgl-kernel (#2483)

Authored by Yineng Zhang on Dec 14, 2024; committed by GitHub on Dec 14, 2024.
Parent: 2f9bd0fa
Showing 4 changed files with 28 additions and 27 deletions (+28, -27).
sgl-kernel/.clang-format                                +8   -0
sgl-kernel/Makefile                                     +4   -1
sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc           +4   -6
sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu    +12  -20
sgl-kernel/.clang-format (new file, mode 100644)
BasedOnStyle: Google
IndentWidth: 2
ColumnLimit: 120
AllowShortFunctionsOnASingleLine: Empty
DerivePointerAlignment: false
PointerAlignment: Left
NamespaceIndentation: None
SortIncludes: true
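
As an illustration only (this snippet is not part of the commit), a hypothetical helper formatted under this configuration would use two-space indents, left-bound pointers, and lines up to 120 columns:

// Hypothetical example, formatted per the style above; not from the repository.
float* zero_buffer(float* data, int n) {  // PointerAlignment: Left binds '*' to the type
  for (int i = 0; i < n; ++i) {           // IndentWidth: 2
    data[i] = 0.0f;
  }
  return data;
}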
sgl-kernel/Makefile

-.PHONY: tree ln install build clean test
+.PHONY: tree ln install build clean test format

 tree:
 	@tree --prune -I "__pycache__|*.egg-info|*.so|build"

...

@@ -17,3 +17,6 @@ clean:

 test:
 	@pytest tests/
+
+format:
+	@find src tests -name '*.cc' -o -name '*.cu' -o -name '*.cuh' -o -name '*.h' | xargs clang-format -i && find src tests -name '*.py' | xargs isort && find src tests -name '*.py' | xargs black
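
Usage note (an assumption, not spelled out in the diff): running make format from the sgl-kernel directory applies clang-format in place to the .cc, .cu, .cuh, and .h files under src and tests, then runs isort and black over the Python files; clang-format, isort, and black need to be available on PATH for the target to succeed.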
sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc

@@ -2,10 +2,8 @@

 torch::Tensor warp_reduce_cuda(torch::Tensor input);

-#define CHECK_CUDA(x) \
-  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
-  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)

...
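
As a hedged illustration (not shown in this hunk), macros like these are typically invoked at the top of the C++ entry point before dispatching to the CUDA implementation; the wrapper name and body below are assumptions for illustration only, since the actual entry point sits in the elided part of the file.

// Hypothetical wrapper, for illustration only.
torch::Tensor warp_reduce(torch::Tensor input) {
  CHECK_INPUT(input);              // expands to CHECK_CUDA(input); CHECK_CONTIGUOUS(input)
  return warp_reduce_cuda(input);  // declared above, defined in warp_reduce_kernel.cu
}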
sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu

...
@@ -27,32 +27,26 @@ __device__ __forceinline__ scalar_t blockReduceSum(scalar_t val) {
  val = warpReduceSum(val);  // First reduce within warp

  if (lane == 0) shared[wid] = val;  // Write reduced value to shared memory

  __syncthreads();  // Wait for all partial reductions

  // Read from shared memory only if that warp existed
  val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane] : 0;

  if (wid == 0) val = warpReduceSum(val);  // Final reduce within first warp

  return val;
}

template <typename scalar_t>
__global__ void warp_reduce_cuda_kernel(
    const torch::PackedTensorAccessor32<scalar_t, 1, torch::RestrictPtrTraits> input,
    torch::PackedTensorAccessor32<scalar_t, 1, torch::RestrictPtrTraits> output, int N) {
  scalar_t sum = 0;

  // Grid-stride loop
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
    sum += input[i];
  }
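
The warpReduceSum helper called above is not part of this hunk. The sketch below is an assumption about how such a shuffle-based warp reduction is commonly written, shown purely for context; the real definition lives elsewhere in this file.

// Illustrative sketch only; not the repository's actual warpReduceSum.
// Each __shfl_down_sync folds in the value held by the lane 'offset' positions
// higher, halving the active range until lane 0 holds the warp's sum.
template <typename scalar_t>
__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}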
...
@@ -84,12 +78,10 @@ torch::Tensor warp_reduce_cuda(torch::Tensor input) {
  // Allocate output tensor for partial sums
  auto output = torch::empty({blocks}, input.options());

  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "warp_reduce_cuda", ([&] {
    warp_reduce_cuda_kernel<scalar_t><<<blocks, threads>>>(
        input.packed_accessor32<scalar_t, 1, torch::RestrictPtrTraits>(),
        output.packed_accessor32<scalar_t, 1, torch::RestrictPtrTraits>(),
        N);
  }));

  // Sum the partial results
...
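
Design note (inferred from the code shown, not stated in the commit): each block reduces its grid-stride slice of the input to a single partial sum and writes it into output, so output has length blocks; a second, much smaller reduction over those partials (for example, a final output.sum() on the host) yields the scalar result. The exact finishing step is in the elided part of the file.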