Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
3ddb1c46
"tools/utils/array_readwriter/csv.py" did not exist on "60bc0b7692c6733dd930cd4a502f4171720da38d"
Unverified
Commit
3ddb1c46
authored
Dec 02, 2024
by
Lianmin Zheng
Committed by
GitHub
Dec 02, 2024
Browse files
[Minor] Fix logger and style (#2325)
parent
480e38a7
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
9 deletions
+16
-9
python/sglang/bench_serving.py
python/sglang/bench_serving.py
+0
-1
python/sglang/srt/model_executor/model_runner.py
python/sglang/srt/model_executor/model_runner.py
+4
-1
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+12
-7
No files found.
python/sglang/bench_serving.py
View file @
3ddb1c46
...
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
...
@@ -163,7 +163,6 @@ async def async_request_openai_completions(
"max_tokens"
:
request_func_input
.
output_len
,
"max_tokens"
:
request_func_input
.
output_len
,
"stream"
:
not
args
.
disable_stream
,
"stream"
:
not
args
.
disable_stream
,
"ignore_eos"
:
not
args
.
disable_ignore_eos
,
"ignore_eos"
:
not
args
.
disable_ignore_eos
,
"lora_path"
:
request_func_input
.
lora_name
,
**
request_func_input
.
extra_request_body
,
**
request_func_input
.
extra_request_body
,
}
}
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
...
...
python/sglang/srt/model_executor/model_runner.py
View file @
3ddb1c46
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
import
gc
import
gc
import
json
import
json
import
logging
import
logging
import
time
from
typing
import
Optional
from
typing
import
Optional
import
torch
import
torch
...
@@ -129,7 +130,7 @@ class ModelRunner:
...
@@ -129,7 +130,7 @@ class ModelRunner:
# Global vars
# Global vars
if
server_args
.
show_time_cost
:
if
server_args
.
show_time_cost
:
enable_show_time_cost
()
enable_show_time_cost
()
if
server_args
.
disable_disk_cache
:
if
server_args
.
disable_
outlines_
disk_cache
:
from
outlines.caching
import
disable_cache
from
outlines.caching
import
disable_cache
disable_cache
()
disable_cache
()
...
@@ -623,8 +624,10 @@ class ModelRunner:
...
@@ -623,8 +624,10 @@ class ModelRunner:
if
self
.
server_args
.
disable_cuda_graph
:
if
self
.
server_args
.
disable_cuda_graph
:
return
return
tic
=
time
.
time
()
logger
.
info
(
"Capture cuda graph begin. This can take up to several minutes."
)
logger
.
info
(
"Capture cuda graph begin. This can take up to several minutes."
)
self
.
cuda_graph_runner
=
CudaGraphRunner
(
self
)
self
.
cuda_graph_runner
=
CudaGraphRunner
(
self
)
logger
.
info
(
f
"Capture cuda graph end. Time elapsed:
{
time
.
time
()
-
tic
:.
2
f
}
s"
)
def
apply_torch_tp
(
self
):
def
apply_torch_tp
(
self
):
logger
.
info
(
f
"Enabling torch tensor parallelism on
{
self
.
tp_size
}
devices."
)
logger
.
info
(
f
"Enabling torch tensor parallelism on
{
self
.
tp_size
}
devices."
)
...
...
python/sglang/srt/server_args.py
View file @
3ddb1c46
...
@@ -122,7 +122,7 @@ class ServerArgs:
...
@@ -122,7 +122,7 @@ class ServerArgs:
disable_jump_forward
:
bool
=
False
disable_jump_forward
:
bool
=
False
disable_cuda_graph
:
bool
=
False
disable_cuda_graph
:
bool
=
False
disable_cuda_graph_padding
:
bool
=
False
disable_cuda_graph_padding
:
bool
=
False
disable_disk_cache
:
bool
=
False
disable_
outlines_
disk_cache
:
bool
=
False
disable_custom_all_reduce
:
bool
=
False
disable_custom_all_reduce
:
bool
=
False
disable_mla
:
bool
=
False
disable_mla
:
bool
=
False
disable_overlap_schedule
:
bool
=
False
disable_overlap_schedule
:
bool
=
False
...
@@ -159,7 +159,7 @@ class ServerArgs:
...
@@ -159,7 +159,7 @@ class ServerArgs:
if
self
.
tp_size
>=
16
:
if
self
.
tp_size
>=
16
:
self
.
mem_fraction_static
=
0.79
self
.
mem_fraction_static
=
0.79
elif
self
.
tp_size
>=
8
:
elif
self
.
tp_size
>=
8
:
self
.
mem_fraction_static
=
0.8
2
self
.
mem_fraction_static
=
0.8
1
elif
self
.
tp_size
>=
4
:
elif
self
.
tp_size
>=
4
:
self
.
mem_fraction_static
=
0.85
self
.
mem_fraction_static
=
0.85
elif
self
.
tp_size
>=
2
:
elif
self
.
tp_size
>=
2
:
...
@@ -192,7 +192,7 @@ class ServerArgs:
...
@@ -192,7 +192,7 @@ class ServerArgs:
)
)
if
self
.
attention_backend
==
"torch_native"
:
if
self
.
attention_backend
==
"torch_native"
:
logger
.
info
(
logger
.
warning
(
"Cuda graph is disabled because of using torch native attention backend"
"Cuda graph is disabled because of using torch native attention backend"
)
)
self
.
disable_cuda_graph
=
True
self
.
disable_cuda_graph
=
True
...
@@ -204,12 +204,12 @@ class ServerArgs:
...
@@ -204,12 +204,12 @@ class ServerArgs:
self
.
cuda_graph_max_bs
=
min
(
self
.
cuda_graph_max_bs
,
96
)
self
.
cuda_graph_max_bs
=
min
(
self
.
cuda_graph_max_bs
,
96
)
self
.
schedule_conservativeness
=
self
.
schedule_conservativeness
*
0.3
self
.
schedule_conservativeness
=
self
.
schedule_conservativeness
*
0.3
self
.
disable_overlap_schedule
=
True
self
.
disable_overlap_schedule
=
True
logger
.
info
(
logger
.
warning
(
f
"DP attention is enabled. The chunked prefill size is adjusted to
{
self
.
chunked_prefill_size
}
to avoid MoE kernel issues. "
f
"DP attention is enabled. The chunked prefill size is adjusted to
{
self
.
chunked_prefill_size
}
to avoid MoE kernel issues. "
f
"The CUDA graph max batch size is adjusted to
{
self
.
cuda_graph_max_bs
}
. "
f
"The CUDA graph max batch size is adjusted to
{
self
.
cuda_graph_max_bs
}
. "
f
"The schedule conservativeness is adjusted to
{
self
.
schedule_conservativeness
}
. "
f
"The schedule conservativeness is adjusted to
{
self
.
schedule_conservativeness
}
. "
"Data parallel size is adjusted to be the same as tensor parallel size. "
"Data parallel size is adjusted to be the same as tensor parallel size. "
"Overlap schedule is disabled."
"Overlap schedule
r
is disabled."
)
)
# GGUF
# GGUF
...
@@ -642,9 +642,9 @@ class ServerArgs:
...
@@ -642,9 +642,9 @@ class ServerArgs:
help
=
"Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed."
,
help
=
"Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed."
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--disable-disk-cache"
,
"--disable-
outlines-
disk-cache"
,
action
=
"store_true"
,
action
=
"store_true"
,
help
=
"Disable disk cache to avoid possible crashes related to file system or high concurrency."
,
help
=
"Disable disk cache
of outlines
to avoid possible crashes related to file system or high concurrency."
,
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"--disable-custom-all-reduce"
,
"--disable-custom-all-reduce"
,
...
@@ -745,6 +745,11 @@ class ServerArgs:
...
@@ -745,6 +745,11 @@ class ServerArgs:
action
=
DeprecatedAction
,
action
=
DeprecatedAction
,
help
=
"'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead."
,
help
=
"'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead."
,
)
)
parser
.
add_argument
(
"--disable-disk-cache"
,
action
=
DeprecatedAction
,
help
=
"'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead."
,
)
@
classmethod
@
classmethod
def
from_cli_args
(
cls
,
args
:
argparse
.
Namespace
):
def
from_cli_args
(
cls
,
args
:
argparse
.
Namespace
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment