change / sglang / Commits / 9380f50f

Unverified commit 9380f50f, authored Jul 02, 2024 by Ying Sheng, committed by GitHub on Jul 02, 2024.

Turn on flashinfer by default (#578)

Parent: 95dc093b

Showing 5 changed files with 14 additions and 27 deletions:
- README.md (+6, -1)
- docs/flashinfer.md (+0, -18)
- python/sglang/srt/layers/radix_attention.py (+1, -1)
- python/sglang/srt/managers/controller/model_runner.py (+3, -3)
- python/sglang/srt/server_args.py (+4, -4)
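Taken together, the change flips a default rather than adding new kernels: FlashInfer attention kernels are now used unless explicitly disabled, and the old `--enable-flashinfer` flag is replaced by an opt-out `--disable-flashinfer` flag (see the server_args.py hunks below). A minimal sketch of the flag logic, using the key names from the diff; this is an illustration only, not the actual sglang code path:

```python
# Minimal sketch of the behavior change in this commit.
# Before: flashinfer was opt-in. After: flashinfer is opt-out.

# Old behavior: the key defaults to False, so flashinfer stays off unless
# the server was started with --enable-flashinfer.
old_args = {"enable_flashinfer": False}
use_flashinfer_before = old_args.get("enable_flashinfer", False)      # False -> Triton kernels

# New behavior: the key still defaults to False, but the check is negated,
# so flashinfer is on unless the server was started with --disable-flashinfer.
new_args = {"disable_flashinfer": False}
use_flashinfer_after = not new_args.get("disable_flashinfer", False)  # True -> FlashInfer kernels

print(use_flashinfer_before, use_flashinfer_after)  # False True
```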
README.md

@@ -34,6 +34,8 @@ The core features include:
pip install "sglang[all]"
```
Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.

### Method 2: From source
```
git clone https://github.com/sgl-project/sglang.git
...

@@ -43,7 +45,11 @@ pip install --upgrade pip
pip install -e "python[all]"
```
Next, [install FlashInfer](https://docs.flashinfer.ai/installation.html) for attention CUDA kernels.

### Notes
- If you see triton errors, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
- If you cannot install FlashInfer, you can use the slower triton kernels by adding `--disable-flashinfer` when launching the server.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`

## Quick Start
...

@@ -363,7 +369,6 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
```
- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

### Supported Models
...
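The updated Notes section tells users who cannot install FlashInfer to fall back to the Triton kernels with `--disable-flashinfer`. A quick way to check which situation you are in before launching; this snippet is not part of the commit, just a sanity check:

```python
# Check whether FlashInfer can be imported in the current environment.
# If it cannot, add --disable-flashinfer when launching the server to use
# the slower Triton kernels instead (per the README note above).
try:
    import flashinfer  # noqa: F401
    print("flashinfer found: the server will use FlashInfer kernels by default")
except ImportError:
    print("flashinfer not found: launch with --disable-flashinfer")
```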
docs/flashinfer.md (deleted, 100644 → 0; last revision 95dc093b)

## Flashinfer Mode

[flashinfer](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving.
It can be used in SGLang runtime to accelerate attention computation.

### Install flashinfer

See https://docs.flashinfer.ai/installation.html.

### Run a Server With Flashinfer Mode

Add `--enable-flashinfer` argument to enable flashinfer when launching a server.

Example:

```bash
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --enable-flashinfer
```
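The deleted page described FlashInfer only at a high level. For context, FlashInfer exposes attention kernels directly as Python functions; the sketch below is an assumption-laden illustration (the exact function name, tensor layout, and dtype requirements should be checked against the FlashInfer documentation for the version you install) and requires a CUDA GPU:

```python
# Hedged illustration: calling a FlashInfer single-request decode kernel directly.
# Assumed API (verify at https://docs.flashinfer.ai/ for your installed version):
#   flashinfer.single_decode_with_kv_cache(q, k, v) with
#   q:   [num_qo_heads, head_dim]
#   k,v: [kv_len, num_kv_heads, head_dim], fp16 tensors on the GPU.
import torch
import flashinfer

num_heads, head_dim, kv_len = 32, 128, 1024
q = torch.randn(num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn(kv_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
v = torch.randn(kv_len, num_heads, head_dim, dtype=torch.float16, device="cuda")

out = flashinfer.single_decode_with_kv_cache(q, k, v)  # attention output, [num_heads, head_dim]
print(out.shape)
```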
python/sglang/srt/layers/radix_attention.py

@@ -26,7 +26,7 @@ class RadixAttention(nn.Module):
        from sglang.srt.managers.controller.model_runner import global_server_args_dict

-        if global_server_args_dict.get("enable_flashinfer", False):
+        if not global_server_args_dict.get("disable_flashinfer", False):
            self.prefill_forward = self.prefill_forward_flashinfer
            self.extend_forward = self.prefill_forward_flashinfer
            self.decode_forward = self.decode_forward_flashinfer
...
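The hunk above keeps the existing pattern of binding the forward implementations once at construction time; only the guard flips from an opt-in to an opt-out check. A small sketch of that dispatch pattern in isolation, using a hypothetical class rather than the real RadixAttention:

```python
# Hypothetical illustration of construction-time dispatch: the attention module
# picks its forward implementations once, based on a config flag, instead of
# branching on every call.
class TinyAttention:
    def __init__(self, server_args: dict):
        if not server_args.get("disable_flashinfer", False):
            self.extend_forward = self._extend_flashinfer
            self.decode_forward = self._decode_flashinfer
        else:
            self.extend_forward = self._extend_triton
            self.decode_forward = self._decode_triton

    def _extend_flashinfer(self, x): return f"flashinfer extend({x})"
    def _decode_flashinfer(self, x): return f"flashinfer decode({x})"
    def _extend_triton(self, x): return f"triton extend({x})"
    def _decode_triton(self, x): return f"triton decode({x})"

attn = TinyAttention({})                      # default after this commit: flashinfer
print(attn.decode_forward("tok"))             # flashinfer decode(tok)
print(TinyAttention({"disable_flashinfer": True}).decode_forward("tok"))  # triton decode(tok)
```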
python/sglang/srt/managers/controller/model_runner.py

@@ -201,7 +201,7 @@ class InputMetadata:
        if forward_mode == ForwardMode.EXTEND:
            ret.init_extend_args()

-        if global_server_args_dict.get("enable_flashinfer", False):
+        if not global_server_args_dict.get("disable_flashinfer", False):
            ret.init_flashinfer_args(
                model_runner.model_config.num_attention_heads // tp_size,
                model_runner.model_config.get_num_kv_heads(tp_size),
...

@@ -263,7 +263,7 @@ class ModelRunner:
        # Set some global args
        global global_server_args_dict
        global_server_args_dict = {
-            "enable_flashinfer": server_args.enable_flashinfer,
+            "disable_flashinfer": server_args.disable_flashinfer,
            "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
        }
...

@@ -359,7 +359,7 @@ class ModelRunner:
        return c

    def init_flash_infer(self):
-        if global_server_args_dict.get("enable_flashinfer", False):
+        if not global_server_args_dict.get("disable_flashinfer", False):
            from flashinfer import (
                BatchPrefillWithPagedKVCacheWrapper,
                BatchDecodeWithPagedKVCacheWrapper,
...
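The first hunk passes per-tensor-parallel-rank head counts into `init_flashinfer_args`. As a hedged sketch of what those two arguments typically represent: `heads_per_rank` below is a hypothetical helper, and the exact behavior of `get_num_kv_heads` (for GQA models with fewer KV heads than ranks) is an assumption.

```python
# Hedged sketch: deriving per-rank head counts for a tensor-parallel attention
# setup. num_attention_heads // tp_size splits query heads across ranks; for
# grouped-query attention the KV heads may be fewer, in which case each rank
# keeps at least one (possibly replicated) KV head.
def heads_per_rank(num_attention_heads: int, num_kv_heads: int, tp_size: int):
    q_heads_per_rank = num_attention_heads // tp_size
    kv_heads_per_rank = max(1, num_kv_heads // tp_size)
    return q_heads_per_rank, kv_heads_per_rank

print(heads_per_rank(32, 32, 2))  # Llama-2-7B-style MHA on 2 GPUs -> (16, 16)
print(heads_per_rank(64, 8, 8))   # GQA-style model on 8 GPUs -> (8, 1)
```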
python/sglang/srt/server_args.py

@@ -50,7 +50,7 @@ class ServerArgs:
    load_balance_method: str = "round_robin"

    # Optimization/debug options
-    enable_flashinfer: bool = False
+    disable_flashinfer: bool = False
    attention_reduce_in_fp32: bool = False
    disable_radix_cache: bool = False
    disable_regex_jump_forward: bool = False
...
@@ -287,9 +287,9 @@ class ServerArgs:
        # Optimization/debug options
        parser.add_argument(
-            "--enable-flashinfer",
+            "--disable-flashinfer",
            action="store_true",
-            help="Enable flashinfer inference kernels",
+            help="Disable flashinfer inference kernels",
        )
        parser.add_argument(
            "--attention-reduce-in-fp32",
...

@@ -322,7 +322,7 @@ class ServerArgs:
    def print_mode_args(self):
        return (
-            f"enable_flashinfer={self.enable_flashinfer}, "
+            f"disable_flashinfer={self.disable_flashinfer}, "
            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
            f"disable_radix_cache={self.disable_radix_cache}, "
            f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
...
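The flag rename follows a common opt-out pattern: the dataclass field defaults to False and the CLI switch is a `store_true` flag, so omitting the flag keeps FlashInfer on. A minimal, self-contained sketch of just that pattern; the names mirror the diff, but the real ServerArgs has many more fields:

```python
# Minimal sketch of the opt-out flag pattern used in this commit.
import argparse
from dataclasses import dataclass

@dataclass
class Args:
    disable_flashinfer: bool = False  # FlashInfer is on unless explicitly disabled

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-flashinfer",
    action="store_true",
    help="Disable flashinfer inference kernels",
)

default_run = Args(**vars(parser.parse_args([])))
opt_out_run = Args(**vars(parser.parse_args(["--disable-flashinfer"])))
print(default_run.disable_flashinfer, opt_out_run.disable_flashinfer)  # False True
```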