change / sglang / Commits / 2ce87935

Commit 2ce87935 (unverified), authored May 11, 2025 by applesaucethebun; committed by GitHub on May 11, 2025.
Add typo checker in pre-commit (#6179)
Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
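The diff below contains only the typo corrections the new checker caught; the hook registration itself is not visible in this view. As a hedged sketch, a typo checker is typically wired into .pre-commit-config.yaml roughly like this (the crate-ci/typos hook is an assumption, and the rev pin is illustrative, not taken from this commit):

repos:
  - repo: https://github.com/crate-ci/typos
    rev: v1.21.0   # illustrative pin; the actual rev chosen by the PR is not shown here
    hooks:
      - id: typos

Once registered, `pre-commit run typos --all-files` would flag misspellings like the ones fixed below.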
Showing 19 changed files with 26 additions and 26 deletions (+26 -26)
sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py (+1 -1)
sgl-kernel/csrc/cpu/common.h (+1 -1)
sgl-kernel/csrc/cpu/decode.cpp (+2 -2)
sgl-kernel/csrc/cpu/extend.cpp (+2 -2)
sgl-kernel/csrc/cpu/gemm.h (+1 -1)
sgl-kernel/csrc/cpu/gemm_int8.cpp (+1 -1)
sgl-kernel/csrc/speculative/packbit.cu (+1 -1)
sgl-kernel/include/sgl_kernel_torch_shim.h (+3 -3)
sgl-kernel/python/sgl_kernel/elementwise.py (+1 -1)
sgl-kernel/python/sgl_kernel/flash_attn.py (+1 -1)
sgl-kernel/python/sgl_kernel/moe.py (+3 -3)
sgl-kernel/tests/test_flash_attention.py (+1 -1)
sgl-kernel/tests/test_per_token_group_quant_8bit.py (+1 -1)
sgl-router/src/tree.rs (+1 -1)
test/srt/test_openai_server.py (+1 -1)
test/srt/test_session_control.py (+2 -2)
test/srt/test_srt_endpoint.py (+1 -1)
test/srt/test_srt_engine_with_quant_args.py (+1 -1)
test/srt/test_vlm_accuracy.py (+1 -1)
sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py
@@ -20,7 +20,7 @@ def _per_token_group_quant_8bit(
     y_s_ptr,
     # Stride of input
     y_stride,
-    # Collums of input
+    # Columns of input
     N,
     # Avoid to divide zero
     eps,
sgl-kernel/csrc/cpu/common.h
@@ -49,7 +49,7 @@ namespace {
 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_LAST_DIM_CONTIGUOUS(x) \
-  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
 #define CHECK_INPUT(x) \
   CHECK_CPU(x);        \
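A side note on what the fixed macro checks: requiring only the innermost stride to be 1 is weaker than full contiguity. A small illustrative snippet (not part of this commit; plain PyTorch, which uses the same stride semantics as the C++ check):

import torch

x = torch.randn(4, 8)   # strides (8, 1): fully contiguous
y = x.t()               # strides (1, 8): last-dim stride != 1, would fail the check
z = x[:, :4]            # strides (8, 1): not contiguous overall, but last dim is
assert x.stride(-1) == 1
assert y.stride(-1) != 1
assert z.stride(-1) == 1 and not z.is_contiguous()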
sgl-kernel/csrc/cpu/decode.cpp
@@ -718,7 +718,7 @@ void decode_attention_kernel_impl(
       m_prime = m_i;
-      // caculate V' <- s_delta @ V + V' * m_delta
+      // calculate V' <- s_delta @ V + V' * m_delta
       index_gemm_kernel_nn<scalar_t, index_t>(
           /* A */ s_delta,
           /* B */ v_buffer + head_id * v_strideH,
@@ -925,7 +925,7 @@ void decode_attention_grouped_kernel_impl(
         m_prime[h] = m_i;
       }
-      // caculate V' <- s_delta @ V + V' * m_delta
+      // calculate V' <- s_delta @ V + V' * m_delta
       index_gemm_kernel_nn<scalar_t, index_t>(
           /* A */ s_delta,
           /* B */ v_buffer + head_kv_id * v_strideH,
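The comment fixed above (and the similar ones in extend.cpp below) describes the online-softmax rescaling step used by flash-attention-style kernels: each new KV block's contribution is added while the running accumulator V' is rescaled by m_delta to account for the updated running max. A minimal numpy sketch of that update (illustrative only; the names mirror the comments, not the kernel's actual signatures):

import numpy as np

def decode_attention(q, k_blocks, v_blocks):
    # q: [head_size]; each k block: [n, head_size]; each v block: [n, head_size_v]
    v_prime = np.zeros(v_blocks[0].shape[-1])  # running un-normalized output V'
    m_prime = -np.inf                          # running max of the attention logits
    denom = 0.0                                # running softmax denominator
    for k, v in zip(k_blocks, v_blocks):
        s = k @ q                              # logits for this KV block
        m_i = max(m_prime, s.max())            # updated running max
        m_delta = np.exp(m_prime - m_i)        # rescales the old accumulator
        s_delta = np.exp(s - m_i)              # shifted scores for the new block
        # the step the fixed comment documents: V' <- s_delta @ V + V' * m_delta
        v_prime = s_delta @ v + v_prime * m_delta
        denom = denom * m_delta + s_delta.sum()
        m_prime = m_i
    return v_prime / denom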
sgl-kernel/csrc/cpu/extend.cpp
@@ -323,7 +323,7 @@ void extend_attention_kernel_impl(
           /* ld_src */ v_strideN,
           /* ld_dst */ head_size_v);
-      // caculate V' <- s_delta @ V + V'
+      // calculate V' <- s_delta @ V + V'
       at::native::cpublas::brgemm(
           /* M */ m_size,
           /* N */ head_size_v,
@@ -434,7 +434,7 @@ void extend_attention_kernel_impl(
           /* ld_src */ ve_strideN,
           /* ld_dst */ head_size_v);
-      // caculate V' <- s_delta @ V + V'
+      // calculate V' <- s_delta @ V + V'
       at::native::cpublas::brgemm(
           /* M */ m_size,
           /* N */ head_size_v,
sgl-kernel/csrc/cpu/gemm.h
@@ -79,7 +79,7 @@ void fused_experts_int8_kernel_impl(
     int64_t topk,
     int64_t num_tokens_post_pad);
-// shared expert implememntation for int8 w8a8
+// shared expert implementation for int8 w8a8
 template <typename scalar_t>
 void shared_expert_int8_kernel_impl(
     scalar_t* __restrict__ output,
sgl-kernel/csrc/cpu/gemm_int8.cpp
@@ -51,7 +51,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
     __m512 vd0;
     __m512 vd1[COLS];
-    // oops! 4x4 spills but luckly we use 4x2
+    // oops! 4x4 spills but we use 4x2
     __m512 vbias[COLS];
     // [NOTE]: s8s8 igemm compensation in avx512-vnni
sgl-kernel/csrc/speculative/packbit.cu
-// This is only a pluggin used for flashinfer 0.1.6. The new version does not need it.
+// This is only a plugin used for flashinfer 0.1.6. The new version does not need it.
 /*
  * Copyright (c) 2025 by SGLang team.
  * Copyright (c) 2025 by FlashInfer team.
sgl-kernel/include/sgl_kernel_torch_shim.h
@@ -20,16 +20,16 @@ limitations under the License.
 #include <torch/library.h>

 /**
- * Unforunately, the type signatures of the flash_attn ops are not compatible
+ * Unfortunately, the type signatures of the flash_attn ops are not compatible
  * with the PyTorch library bindings. To get around that we use
- * `make_pytorch_shim` which creates a lambda that exponses the API using
+ * `make_pytorch_shim` which creates a lambda that exposes the API using
  * PyTorch compatible types to the types, then converts them to the types
  * expected by the flash_attn ops. This shims allows us to make minimal changes
  * to `flash_api.cpp` making it easier to synchronize with upstream changes.
 *
 * The `pytorch_library_compatible_type` struct is used to map from the
 * flash_attn ops types to a PyTorch library compatible one. The main issues is
- * that the following types are not support by PyTorch libary bindings:
+ * that the following types are not support by PyTorch library bindings:
  * - `int`
  * - `float`
  * - `std::optional<T> &`
sgl-kernel/python/sgl_kernel/elementwise.py
@@ -229,7 +229,7 @@ def apply_rope_with_cos_sin_cache_inplace(
         Whether to use Neox style RoPE, default: ``True``.
             * If ``True``, the last dimension of the query/key tensor is not interleaved, i.e.,
-              we rorate the first half dimensions ``([..., :head_dim//2])`` and the second half
+              we rotate the first half dimensions ``([..., :head_dim//2])`` and the second half
               dimensions ``([..., head_dim//2:])``.
             * If ``False``, the last dimension of the query/key tensor is interleaved, i.e.,
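For readers of the docstring above: the two layouts differ only in how dimensions are paired for rotation. A rough numpy illustration (a sketch, not the kernel's implementation; cos and sin are assumed broadcastable to the half-width):

import numpy as np

def rope_neox(x, cos, sin):
    # Neox style: rotate the first half dims against the second half
    d = x.shape[-1] // 2
    x1, x2 = x[..., :d], x[..., d:]
    return np.concatenate([x1 * cos - x2 * sin, x2 * cos + x1 * sin], axis=-1)

def rope_interleaved(x, cos, sin):
    # interleaved (GPT-J) style: rotate even dims against odd dims
    x1, x2 = x[..., 0::2], x[..., 1::2]
    out = np.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x2 * cos + x1 * sin
    return out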
sgl-kernel/python/sgl_kernel/flash_attn.py
@@ -17,7 +17,7 @@ def is_fa3_supported(device=None) -> bool:
     # Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
     # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
     # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
-    # Thats mean if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
+    # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
     return (
         torch.cuda.get_device_capability(device)[0] == 9
         or torch.cuda.get_device_capability(device)[0] == 8
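For reference, torch.cuda.get_device_capability returns a (major, minor) tuple, so the expression above gates on the major compute-capability number only. A small illustrative equivalent (not part of the commit):

import torch

# (major, minor) examples: A100 -> (8, 0), L40/4090 -> (8, 9), H100 -> (9, 0)
major, _minor = torch.cuda.get_device_capability()
fa3_ok = major in (8, 9)  # same gate as the two == comparisons above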
sgl-kernel/python/sgl_kernel/moe.py
@@ -45,10 +45,10 @@ def moe_fused_gate(
 ):
     # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
     # it split group of expert into num_expert_group, and use top2 expert weight sum in each group
-    # as the group weight to select exerpt groups and then select topk experts within the selected groups
+    # as the group weight to select expert groups and then select topk experts within the selected groups
     # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
-    # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limitted for now.
-    # for non-supported case, we suggestion to use the biased_grouped_topk func in sglang.srt.layers.moe.topk
+    # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now.
+    # for non-supported case, we suggest to use the biased_grouped_topk func in sglang.srt.layers.moe.topk
     # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
     # routed_scaling_factor: if > 0, the last expert will be scaled by this factor
     return torch.ops.sgl_kernel.moe_fused_gate.default(
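The comments fixed above describe a two-stage selection: score each expert group by the sum of its top-2 expert weights, keep the best groups, then take the final top-k only among experts in those groups. A plain-torch sketch of that logic (illustrative; it mirrors the comments rather than the fused kernel, and omits n_share_experts_fusion and routed_scaling_factor):

import torch

def grouped_topk(scores, num_expert_group, topk_group, topk):
    # scores: [num_tokens, num_experts]; num_experts divisible by num_expert_group
    t, e = scores.shape
    g = scores.view(t, num_expert_group, e // num_expert_group)
    # group weight = sum of the top-2 expert weights inside each group
    group_w = g.topk(2, dim=-1).values.sum(dim=-1)   # [t, num_expert_group]
    keep = group_w.topk(topk_group, dim=-1).indices  # indices of the kept groups
    mask = torch.zeros_like(group_w, dtype=torch.bool).scatter(1, keep, True)
    # drop experts in non-selected groups, then take the final top-k
    masked = scores.masked_fill(
        ~mask.repeat_interleave(e // num_expert_group, dim=1), float("-inf")
    )
    return masked.topk(topk, dim=-1)                 # (weights, expert ids)

# e.g. grouped_topk(torch.randn(4, 64), num_expert_group=8, topk_group=3, topk=6)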
sgl-kernel/tests/test_flash_attention.py
@@ -24,7 +24,7 @@ def is_fa3_supported(device=None) -> bool:
     # Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
     # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
     # And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
-    # Thats mean if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
+    # That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
     return (
         torch.cuda.get_device_capability(device)[0] == 9
         or torch.cuda.get_device_capability(device)[0] == 8
sgl-kernel/tests/test_per_token_group_quant_8bit.py
@@ -21,7 +21,7 @@ def _per_token_group_quant_fp8(
     y_s_ptr,
     # Stride of input
     y_stride,
-    # Collums of input
+    # Columns of input
     N,
     # Avoid to divide zero
     eps,
sgl-router/src/tree.rs
@@ -1070,7 +1070,7 @@ mod tests {
     #[test]
     fn test_utf8_split_seq() {
-        // The string should be indexed and splitted by a utf-8 value basis instead of byte basis
+        // The string should be indexed and split by a utf-8 value basis instead of byte basis
         // use .chars() to get the iterator of the utf-8 value
         let tree = Arc::new(Tree::new());
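The distinction this test exercises: byte offsets and character offsets diverge for multi-byte UTF-8 sequences, so splitting on a byte index can land mid-character; Rust's .chars() iterates code points instead. A quick Python analogue of the same pitfall (illustrative only):

s = "héllo"               # 'é' is 2 bytes in UTF-8
b = s.encode("utf-8")
assert len(s) == 5 and len(b) == 6
print(s[:2])              # 'hé'   -- split by character (code point): fine
print(b[:2])              # b'h\xc3' -- split by byte: lands mid-character
try:
    b[:2].decode("utf-8") # truncated 'é' is invalid UTF-8
except UnicodeDecodeError:
    print("byte-basis split broke a character")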
test/srt/test_openai_server.py
@@ -433,7 +433,7 @@ class TestOpenAIServer(CustomTestCase):
     )
     def test_completion_stream(self):
-        # parallel sampling adn list input are not supported in streaming mode
+        # parallel sampling and list input are not supported in streaming mode
         for echo in [False, True]:
             for logprobs in [None, 5]:
                 for use_list_input in [True, False]:
test/srt/test_session_control.py
@@ -161,7 +161,7 @@ class TestSessionControl(CustomTestCase):
             ]
         )
-        # query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort
+        # query with a non-existing rid (the last one should be disappeared because of backtrack), should see abort
         response = requests.post(
             self.base_url + "/generate",
             json={
@@ -668,7 +668,7 @@ class TestSessionControlVision(CustomTestCase):
         ).json()
         outputs_from_session.append(response["text"])
-        # query with a non-existing rid (the last one should be disappeared becuase of backtrack), should see abort
+        # query with a non-existing rid (the last one should be disappeared because of backtrack), should see abort
         response = requests.post(
             self.base_url + "/generate",
             json={
test/srt/test_srt_endpoint.py
@@ -295,7 +295,7 @@ class TestSRTEndpoint(CustomTestCase):
         print(f"{output_top_logprobs=}")
         # Parse results
-        # This is becaues the grammar constraint allows all prefix tokens
+        # This is because the grammar constraint allows all prefix tokens
         logprobs = [None] * 2
         for i in range(len(output_top_logprobs)):
             try:
test/srt/test_srt_engine_with_quant_args.py
@@ -8,7 +8,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase):
     def test_1_quantization_args(self):
-        # we only test fp8 because other methods are currenly depend on vllm. We can add other methods back to test after vllm depency is resolved.
+        # we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved.
         quantization_args_list = [
             # "awq",
             "fp8",
test/srt/test_vlm_accuracy.py
@@ -116,7 +116,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
                 }},
                 {{
                     "type": "text",
-                    "text": "Whats in this picture?"
+                    "text": "What's in this picture?"
                 }}
             ]
         }}