Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
11f881d1
Unverified
Commit
11f881d1
authored
Nov 17, 2024
by
Lianmin Zheng
Committed by
GitHub
Nov 17, 2024
Browse files
Deprecate --disable-flashinfer and --disable-flashinfer-sampling (#2065)
parent
38625e21
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
28 deletions
+25
-28
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+22
-26
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+2
-0
test/srt/test_torch_compile_moe.py
test/srt/test_torch_compile_moe.py
+1
-2
No files found.
python/sglang/srt/server_args.py
View file @
11f881d1
...
...
@@ -116,8 +116,6 @@ class ServerArgs:
grammar_backend
:
Optional
[
str
]
=
"outlines"
# Optimization/debug options
disable_flashinfer
:
bool
=
False
disable_flashinfer_sampling
:
bool
=
False
disable_radix_cache
:
bool
=
False
disable_jump_forward
:
bool
=
False
disable_cuda_graph
:
bool
=
False
...
...
@@ -179,20 +177,6 @@ class ServerArgs:
self
.
chunked_prefill_size
//=
4
# make it 2048
self
.
cuda_graph_max_bs
=
4
# Deprecation warnings
if
self
.
disable_flashinfer
:
logger
.
warning
(
"The option '--disable-flashinfer' will be deprecated in the next release. "
"Please use '--attention-backend triton' instead."
)
self
.
attention_backend
=
"triton"
if
self
.
disable_flashinfer_sampling
:
logger
.
warning
(
"The option '--disable-flashinfer-sampling' will be deprecated in the next release. "
"Please use '--sampling-backend pytorch' instead. "
)
self
.
sampling_backend
=
"pytorch"
if
not
is_flashinfer_available
():
self
.
attention_backend
=
"triton"
self
.
sampling_backend
=
"pytorch"
...
...
@@ -615,16 +599,6 @@ class ServerArgs:
)
# Optimization/debug options
parser
.
add_argument
(
"--disable-flashinfer"
,
action
=
"store_true"
,
help
=
"Disable flashinfer attention kernels. This option will be deprecated in the next release. Please use '--attention-backend triton' instead."
,
)
parser
.
add_argument
(
"--disable-flashinfer-sampling"
,
action
=
"store_true"
,
help
=
"Disable flashinfer sampling kernels. This option will be deprecated in the next release. Please use '--sampling-backend pytorch' instead."
,
)
parser
.
add_argument
(
"--disable-radix-cache"
,
action
=
"store_true"
,
...
...
@@ -733,6 +707,18 @@ class ServerArgs:
help
=
"Delete the model checkpoint after loading the model."
,
)
# Deprecated arguments
parser
.
add_argument
(
"--disable-flashinfer"
,
action
=
DeprecatedAction
,
help
=
"'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead."
,
)
parser
.
add_argument
(
"--disable-flashinfer-sampling"
,
action
=
DeprecatedAction
,
help
=
"'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytorch' instead."
,
)
@
classmethod
def
from_cli_args
(
cls
,
args
:
argparse
.
Namespace
):
args
.
tp_size
=
args
.
tensor_parallel_size
...
...
@@ -826,3 +812,13 @@ class LoRAPathAction(argparse.Action):
getattr
(
namespace
,
self
.
dest
)[
name
]
=
path
else
:
getattr
(
namespace
,
self
.
dest
)[
lora_path
]
=
lora_path
class DeprecatedAction(argparse.Action):
    """Argparse action that rejects a command-line option which has been removed.

    Register a retired flag with ``action=DeprecatedAction`` and put the
    migration hint in ``help``. The flag still shows up in ``--help`` output,
    but supplying it on the command line aborts parsing with that hint.
    """

    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        """Create the action.

        Args:
            option_strings: Option strings bound to this action
                (e.g. ``["--disable-flashinfer"]``).
            dest: Namespace attribute name argparse associates with the option.
            nargs: Number of values consumed after the flag. Defaults to 0 so
                the deprecated option behaves like a bare switch.
            **kwargs: Remaining ``argparse.Action`` keyword arguments
                (``help``, ``default``, ...), forwarded unchanged.
        """
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        """Abort parsing as soon as the deprecated option is encountered.

        Raises:
            ValueError: Always; the message is this action's ``help`` text.
        """
        raise ValueError(self.help)
python/sglang/srt/utils.py
View file @
11f881d1
...
...
@@ -71,6 +71,8 @@ def is_flashinfer_available():
Check whether flashinfer is available.
As of Oct. 6, 2024, it is only available on NVIDIA GPUs.
"""
if
os
.
environ
.
get
(
"SGLANG_IS_FLASHINFER_AVAILABLE"
,
"true"
)
==
"false"
:
return
False
return
torch
.
cuda
.
is_available
()
and
not
is_hip
()
...
...
test/srt/test_torch_compile_moe.py
View file @
11f881d1
...
...
@@ -65,8 +65,7 @@ class TestTorchCompile(unittest.TestCase):
tok
=
time
.
time
()
print
(
f
"
{
res
=
}
"
)
throughput
=
max_tokens
/
(
tok
-
tic
)
print
(
f
"Throughput:
{
throughput
}
tokens/s"
)
self
.
assertGreaterEqual
(
throughput
,
290
)
self
.
assertGreaterEqual
(
throughput
,
285
)
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment