sglang / Commits / ebaa2f31

Commit ebaa2f31 (unverified)
Rename arguments `--disable-nan-detection` to `--enable-nan-detection` (#2066)

Authored Nov 17, 2024 by Lianmin Zheng; committed by GitHub on Nov 17, 2024
Parent commit: 62832bb2

Showing 5 changed files with 17 additions and 20 deletions:
python/sglang/srt/layers/sampler.py                +1 / -1
python/sglang/srt/managers/schedule_batch.py       +1 / -1
python/sglang/srt/model_executor/model_runner.py   +5 / -1
python/sglang/srt/models/gemma2.py                 +1 / -0
python/sglang/srt/server_args.py                   +9 / -17
python/sglang/srt/layers/sampler.py

@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
 class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
-        self.use_nan_detectioin = not global_server_args_dict["disable_nan_detection"]
+        self.use_nan_detectioin = global_server_args_dict["enable_nan_detection"]

     def forward(
         self,
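For context on what this flag gates: NaN detection in a sampler typically means validating the logits before sampling and recovering (or at least warning) when non-finite values appear. The sketch below is a hypothetical illustration under that assumption, not sglang's actual implementation; the helper name and the fallback behavior are invented.

import torch

def maybe_check_nan(logits: torch.Tensor, enable_nan_detection: bool) -> torch.Tensor:
    # Hypothetical helper: when detection is enabled, warn and replace non-finite
    # logits with zeros so sampling degrades gracefully instead of propagating NaNs.
    if enable_nan_detection and not torch.all(torch.isfinite(logits)):
        print("warning: NaN/Inf detected in logits; sanitizing before sampling")
        logits = torch.where(torch.isfinite(logits), logits, torch.zeros_like(logits))
    return logits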
python/sglang/srt/managers/schedule_batch.py

@@ -57,7 +57,7 @@ global_server_args_dict = {
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
     "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
-    "disable_nan_detection": ServerArgs.disable_nan_detection,
+    "enable_nan_detection": ServerArgs.enable_nan_detection,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
 }
python/sglang/srt/model_executor/model_runner.py

@@ -139,7 +139,7 @@ class ModelRunner:
                 "disable_mla": server_args.disable_mla,
                 "torchao_config": server_args.torchao_config,
                 "disable_penalizer": server_args.disable_penalizer,
-                "disable_nan_detection": server_args.disable_nan_detection,
+                "enable_nan_detection": server_args.enable_nan_detection,
                 "enable_dp_attention": server_args.enable_dp_attention,
             }
         )

@@ -276,6 +276,10 @@ class ModelRunner:
             else None
         )
         self.dtype = self.vllm_model_config.dtype
+        if self.sliding_window_size:
+            assert (
+                self.server_args.attention_backend == "flashinfer"
+            ), "Only flashinfer supports window attention."
         logger.info(
             f"Load weight end. "
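Together with the sampler.py and schedule_batch.py hunks above, this shows the full path of the renamed flag: ServerArgs defines the default, ModelRunner copies the parsed CLI value into global_server_args_dict, and the Sampler reads it back with no negation. The following is a compressed, hypothetical sketch of that flow; the names come from the diff, but the structure is heavily simplified.

from dataclasses import dataclass

@dataclass
class ServerArgs:
    enable_nan_detection: bool = False  # positive-form default (server_args.py)

# Module-level dict seeded from the dataclass defaults (schedule_batch.py).
global_server_args_dict = {"enable_nan_detection": ServerArgs.enable_nan_detection}

def init_model_runner(server_args: ServerArgs) -> None:
    # ModelRunner overwrites the defaults with the parsed CLI values (model_runner.py).
    global_server_args_dict.update(
        {"enable_nan_detection": server_args.enable_nan_detection}
    )

class Sampler:
    def __init__(self) -> None:
        # The sampler now reads the flag directly instead of negating a disable flag
        # (sampler.py); the misspelled attribute name is kept as it appears upstream.
        self.use_nan_detectioin = global_server_args_dict["enable_nan_detection"]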
python/sglang/srt/models/gemma2.py

@@ -332,6 +332,7 @@ class Gemma2ForCausalLM(nn.Module):
     # Gemma does not apply LoRA to the embedding layer.
     embedding_modules = {}
     embedding_padding_modules = []
+    supports_lora = True

     def __init__(
         self,
python/sglang/srt/server_args.py

@@ -124,7 +124,6 @@ class ServerArgs:
     disable_custom_all_reduce: bool = False
     disable_mla: bool = False
     disable_penalizer: bool = False
-    disable_nan_detection: bool = False
     enable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False

@@ -132,6 +131,7 @@ class ServerArgs:
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: int = 160
     torchao_config: str = ""
+    enable_nan_detection: bool = False
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     num_continuous_decode_steps: int = 1

@@ -171,11 +171,11 @@ class ServerArgs:
         else:
             gpu_mem = get_nvgpu_memory_capacity()

         if gpu_mem < 25000:
-            self.chunked_prefill_size //= 4  # make it 2048
-            self.cuda_graph_max_bs = 4
             logger.warning(
                 "Automatically adjust --chunked-prefill-size for small GPUs."
             )
+            self.chunked_prefill_size //= 4  # make it 2048
+            self.cuda_graph_max_bs = 4

         if not is_flashinfer_available():
             self.attention_backend = "triton"

@@ -194,7 +194,7 @@ class ServerArgs:
             self.cuda_graph_max_bs = min(self.cuda_graph_max_bs, 96)
             self.enable_overlap_schedule = False
             logger.warning(
-                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE workload issue. "
+                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
                 f"The CUDA graph max batch size is adjusted to {self.cuda_graph_max_bs}. "
                 "Data parallel size is adjusted to be the same as tensor parallel size."
             )

@@ -204,21 +204,8 @@ class ServerArgs:
                 "Overlap scheduler mode is enabled. This is an experimental feature. "
                 "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
                 "and embedding APIs are not supported and will lead to wrong results. "
-                "The NaN detection is also disabled."
             )
             self.disable_penalizer = True
-            self.disable_nan_detection = True
-
-        # Model-specific patches
-        if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
-            logger.info(
-                "Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
-            )
-            self.trust_remote_code = False
-
-        if "gemma-2" in self.model_path.lower():
-            logger.info("When using sliding window in gemma-2, turn on flashinfer.")
-            self.attention_backend = "flashinfer"

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):

@@ -683,6 +670,11 @@ class ServerArgs:
             default=ServerArgs.torchao_config,
             help="Optimize the model with torchao. Experimental feature. Current choices are: int8dq, int8wo, int4wo-<group_size>, fp8wo",
         )
+        parser.add_argument(
+            "--enable-nan-detection",
+            action="store_true",
+            help="Enable the NaN detection for debugging purposes.",
+        )
         parser.add_argument(
             "--enable-p2p-check",
             action="store_true",
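The new CLI flag uses argparse's store_true convention, so omitting it leaves the ServerArgs default (False) and passing it enables detection; there is no longer a double negative to invert downstream. Below is a standalone sketch of that behavior; the parser is illustrative, not the full sglang argument parser.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-nan-detection",
    action="store_true",
    help="Enable the NaN detection for debugging purposes.",
)

# Omitting the flag keeps the default (False); passing it turns detection on.
print(parser.parse_args([]).enable_nan_detection)                          # False
print(parser.parse_args(["--enable-nan-detection"]).enable_nan_detection)  # True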