sglang, commit 730d084f

Minor style fix for sgl-kernel (#4243)

Authored Mar 09, 2025 by Lianmin Zheng; committed via GitHub on Mar 09, 2025 (unverified signature)
Parent: 4a05bdfa
Showing 5 changed files with 20 additions and 22 deletions (+20, -22)
python/sglang/srt/server_args.py     (+2, -2)
sgl-kernel/csrc/torch_extension.cc   (+8, -9)
sgl-kernel/include/sgl_kernel_ops.h  (+9, -9)
sgl-kernel/setup.py                  (+1, -0)
test/srt/test_mla_flashinfer.py      (+0, -2)
python/sglang/srt/server_args.py

...
@@ -278,10 +278,10 @@ class ServerArgs:
         if self.speculative_algorithm == "EAGLE":
             if self.max_running_requests is None:
                 self.max_running_requests = 32
-            self.disable_overlap_schedule = True
             self.disable_cuda_graph_padding = True
+            self.disable_overlap_schedule = True
             logger.info(
-                "Overlap scheduler are disabled because of using "
+                "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
             # The token generated from the verify step is counted.
...
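For context, a minimal standalone sketch of the branch after this change. This is a mock, not the real sglang ServerArgs dataclass; the field defaults here are assumptions for illustration only.

import logging

logger = logging.getLogger(__name__)

class MockServerArgs:
    """Mock of the EAGLE adjustment above; not the real ServerArgs."""

    def __init__(self, speculative_algorithm=None, max_running_requests=None):
        self.speculative_algorithm = speculative_algorithm
        self.max_running_requests = max_running_requests
        # Assumed defaults, for illustration.
        self.disable_cuda_graph_padding = False
        self.disable_overlap_schedule = False
        if self.speculative_algorithm == "EAGLE":
            if self.max_running_requests is None:
                self.max_running_requests = 32
            self.disable_cuda_graph_padding = True
            self.disable_overlap_schedule = True
            logger.info(
                "Overlap scheduler is disabled because of using "
                "eagle speculative decoding."
            )

args = MockServerArgs(speculative_algorithm="EAGLE")
assert args.max_running_requests == 32
assert args.disable_overlap_schedule and args.disable_cuda_graph_padding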
sgl-kernel/csrc/torch_extension.cc

...
@@ -41,6 +41,9 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   /*
    * From csrc/attention
    */
+  m.def(
+      "lightning_attention_decode(Tensor q, Tensor k, Tensor v, Tensor past_kv, Tensor slope, Tensor! output, Tensor! "
+      "new_kv) -> ()");
   m.impl("lightning_attention_decode", torch::kCUDA, &lightning_attention_decode);
   /*
...
@@ -67,6 +70,11 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   m.def("gelu_and_mul(Tensor! out, Tensor input, int cuda_stream) -> ()");
   m.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
+  m.def(
+      "apply_rope_pos_ids_cos_sin_cache(Tensor q, Tensor k, Tensor! q_rope, Tensor! k_rope, Tensor cos_sin_cache, "
+      "Tensor pos_ids, bool interleave, int cuda_stream) -> ()");
+  m.impl(
+      "apply_rope_pos_ids_cos_sin_cache", torch::kCUDA, &apply_rope_pos_ids_cos_sin_cache);
   /*
    * From csrc/gemm
    */
...
@@ -109,10 +117,6 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
       "experts_ids, Tensor! num_tokens_post_pad, Tensor! token_cnts_buffer, Tensor! cumsum_buffer) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
-  m.def(
-      "lightning_attention_decode(Tensor q, Tensor k, Tensor v, Tensor past_kv, Tensor slope, Tensor! output, Tensor! "
-      "new_kv) -> ()");
   /*
    * From csrc/speculative
    */
...
@@ -169,11 +173,6 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
       "top_p_sampling_from_probs(Tensor probs, Tensor uniform_samples, Tensor! samples, Tensor! success, Tensor? "
       "maybe_top_p_arr, float top_p_val, bool deterministic, int cuda_stream) -> ()");
   m.impl("top_p_sampling_from_probs", torch::kCUDA, &top_p_sampling_from_probs);
-  m.def(
-      "apply_rope_pos_ids_cos_sin_cache(Tensor q, Tensor k, Tensor! q_rope, Tensor! k_rope, Tensor cos_sin_cache, "
-      "Tensor pos_ids, bool interleave, int cuda_stream) -> ()");
-  m.impl(
-      "apply_rope_pos_ids_cos_sin_cache", torch::kCUDA, &apply_rope_pos_ids_cos_sin_cache);
 }

 REGISTER_EXTENSION(common_ops)
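As a usage note: ops declared with m.def and bound with m.impl in a TORCH_LIBRARY block become callable from Python through torch.ops under the library's namespace, sgl_kernel here. The sketch below is a hypothetical call assuming a CUDA build of the extension has been imported; the schema string is copied from the diff, while the shape convention (input's last dimension is twice the output's) is an assumption based on the common and_mul layout, not something this diff states.

import torch

# Assumes the compiled sgl-kernel extension (common_ops) is already imported,
# which registers the ops in the "sgl_kernel" namespace with torch.
if torch.cuda.is_available():
    x = torch.randn(4, 512, device="cuda", dtype=torch.float16)
    out = torch.empty(4, 256, device="cuda", dtype=torch.float16)
    stream = torch.cuda.current_stream().cuda_stream  # raw stream handle as int
    # Schema from the diff:
    # "gelu_and_mul(Tensor! out, Tensor input, int cuda_stream) -> ()"
    torch.ops.sgl_kernel.gelu_and_mul(out, x, stream)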
sgl-kernel/include/sgl_kernel_ops.h

...
@@ -99,6 +99,15 @@ void gemma_fused_add_rmsnorm(
 void silu_and_mul(at::Tensor& out, at::Tensor& input, int64_t cuda_stream);
 void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, int64_t cuda_stream);
 void gelu_and_mul(at::Tensor& out, at::Tensor& input, int64_t cuda_stream);
+void apply_rope_pos_ids_cos_sin_cache(
+    at::Tensor q,
+    at::Tensor k,
+    at::Tensor q_rope,
+    at::Tensor k_rope,
+    at::Tensor cos_sin_cache,
+    at::Tensor pos_ids,
+    bool interleave,
+    int64_t cuda_stream);
 /*
  * From csrc/gemm
...
@@ -258,12 +267,3 @@ void top_p_sampling_from_probs(
     double top_p_val,
     bool deterministic,
     int64_t cuda_stream);
-void apply_rope_pos_ids_cos_sin_cache(
-    at::Tensor q,
-    at::Tensor k,
-    at::Tensor q_rope,
-    at::Tensor k_rope,
-    at::Tensor cos_sin_cache,
-    at::Tensor pos_ids,
-    bool interleave,
-    int64_t cuda_stream);
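The relocated declaration is for a rotary-embedding kernel driven by a precomputed cos/sin cache. As a rough illustration of what such a kernel computes, here is a pure-PyTorch rotate-half RoPE reference; the cache layout (cos in the first half of the last dimension, sin in the second) and all shapes are assumptions, not taken from the kernel source.

import torch

def rope_reference(q, cos_sin_cache, pos_ids):
    # q: (num_tokens, num_heads, head_dim); cos_sin_cache: (max_pos, head_dim)
    # storing per-position [cos | sin] halves (assumed layout).
    half = q.shape[-1] // 2
    cos = cos_sin_cache[pos_ids, :half][:, None, :]  # (num_tokens, 1, half)
    sin = cos_sin_cache[pos_ids, half:][:, None, :]
    q1, q2 = q[..., :half], q[..., half:]
    # Standard rotate-half rotary embedding.
    return torch.cat([q1 * cos - q2 * sin, q2 * cos + q1 * sin], dim=-1)

q = torch.randn(3, 2, 8)
cache = torch.randn(16, 8)
pos_ids = torch.tensor([0, 1, 2])
q_rope = rope_reference(q, cache, pos_ids)
assert q_rope.shape == q.shape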
sgl-kernel/setup.py

...
@@ -76,6 +76,7 @@ nvcc_flags = [
     "-std=c++17",
     "-use_fast_math",
     "-DFLASHINFER_ENABLE_F16",
+    "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
     "-DCUTLASS_VERSIONS_GENERATED",
     "-DCUTE_USE_PACKED_TUPLE=1",
     "-DCUTLASS_TEST_LEVEL=0",
...
test/srt/test_mla_flashinfer.py

...
@@ -6,9 +6,7 @@ import torch
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
-from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
-    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     popen_launch_server,
...