sglang · Commit 8f2c522a (Unverified)
Authored Jan 16, 2025 by Lianmin Zheng; committed by GitHub on Jan 16, 2025

Improve benchmark scripts and error message printing (#2922)

Parent: 75964177
Showing 8 changed files, with 110 additions and 55 deletions:

python/sglang/bench_offline_throughput.py  (+22, -15)
python/sglang/bench_serving.py  (+37, -28)
python/sglang/srt/managers/io_struct.py  (+6, -0)
python/sglang/srt/managers/scheduler.py  (+2, -1)
python/sglang/srt/managers/tokenizer_manager.py  (+34, -7)
python/sglang/srt/server.py  (+6, -2)
python/sglang/test/test_utils.py  (+1, -0)
test/srt/test_moe_ep.py  (+2, -2)
python/sglang/bench_offline_throughput.py

@@ -39,14 +39,15 @@ class BenchArgs:
     dataset_path: str = ""
     num_prompts: int = 1000
     sharegpt_output_len: Optional[int] = None
+    sharegpt_context_len: Optional[int] = None
     random_input_len: int = 1024
     random_output_len: int = 1024
     random_range_ratio: float = 0.0
-    gen_num_groups: int = 64
-    gen_prompts_per_group: int = 16
-    gen_system_prompt_len: int = 2048
-    gen_question_len: int = 128
-    gen_output_len: int = 256
+    gsp_num_groups: int = 64
+    gsp_prompts_per_group: int = 16
+    gsp_system_prompt_len: int = 2048
+    gsp_question_len: int = 128
+    gsp_output_len: int = 256
     disable_ignore_eos: bool = False
     extra_request_body: Optional[str] = None
     seed: int = 1

@@ -82,6 +83,12 @@ class BenchArgs:
             default=BenchArgs.sharegpt_output_len,
             help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
         )
+        parser.add_argument(
+            "--sharegpt-context-len",
+            type=int,
+            default=BenchArgs.sharegpt_context_len,
+            help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+        )
         parser.add_argument(
             "--random-input-len",
             type=int,

@@ -102,35 +109,35 @@ class BenchArgs:
             "used only for random dataset.",
         )
         parser.add_argument(
-            "--gen-num-groups",
+            "--gsp-num-groups",
             type=int,
-            default=BenchArgs.gen_num_groups,
+            default=BenchArgs.gsp_num_groups,
             help="Number of groups with shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-prompts-per-group",
+            "--gsp-prompts-per-group",
             type=int,
-            default=BenchArgs.gen_prompts_per_group,
+            default=BenchArgs.gsp_prompts_per_group,
             help="Number of prompts per group of shared prefix, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-system-prompt-len",
+            "--gsp-system-prompt-len",
             type=int,
-            default=BenchArgs.gen_system_prompt_len,
+            default=BenchArgs.gsp_system_prompt_len,
             help="System prompt length, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-question-len",
+            "--gsp-question-len",
             type=int,
-            default=BenchArgs.gen_question_len,
+            default=BenchArgs.gsp_question_len,
             help="Question length, used"
             "only for generate-shared-prefix",
         )
         parser.add_argument(
-            "--gen-output-len",
+            "--gsp-output-len",
             type=int,
-            default=BenchArgs.gen_output_len,
+            default=BenchArgs.gsp_output_len,
             help="Target length in tokens for outputs in generated-shared-prefix dataset",
         )
         parser.add_argument(
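The generated-shared-prefix benchmark options move from a --gen-* to a --gsp-* prefix, matching the renamed gsp_* fields on BenchArgs. A minimal sketch of an invocation under the new names; the model path is a placeholder, and --model-path and --dataset-name are assumed to be the script's existing flags:

# Hypothetical launch of the offline throughput benchmark with the renamed
# --gsp-* flags; flag values mirror the BenchArgs defaults in the diff above.
import subprocess

subprocess.run(
    [
        "python", "-m", "sglang.bench_offline_throughput",
        "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
        "--dataset-name", "generated-shared-prefix",         # assumed flag
        "--gsp-num-groups", "64",
        "--gsp-prompts-per-group", "16",
        "--gsp-system-prompt-len", "2048",
        "--gsp-question-len", "128",
        "--gsp-output-len", "256",
    ],
    check=True,
)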
python/sglang/bench_serving.py

@@ -452,6 +452,7 @@ def get_dataset(args, tokenizer):
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
+            context_len=args.sharegpt_context_len,
         )
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(

@@ -464,11 +465,11 @@ def get_dataset(args, tokenizer):
         )
     elif args.dataset_name == "generated-shared-prefix":
         input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.gen_num_groups,
-            prompts_per_group=args.gen_prompts_per_group,
-            system_prompt_len=args.gen_system_prompt_len,
-            question_len=args.gen_question_len,
-            output_len=args.gen_output_len,
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
             tokenizer=tokenizer,
         )
     else:

@@ -560,6 +561,7 @@ def sample_sharegpt_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
+    context_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

@@ -597,14 +599,15 @@ def sample_sharegpt_requests(
         output_len = (
             len(completion_token_ids) if fixed_output_len is None else fixed_output_len
         )
-        if prompt_len < 4 or output_len < 4:
+        if prompt_len < 1 or output_len < 1:
             # Prune too short sequences.
             continue
-        if prompt_len > 1024 or (
-            prompt_len + output_len > 2048 and fixed_output_len is None
-        ):
+        if context_len and prompt_len + output_len > context_len:
             # Prune too long sequences.
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))

     print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")

@@ -706,8 +709,8 @@ def get_gen_prefix_cache_path(args, tokenizer):
     # Create a unique cache filename based on the generation parameters
     cache_key = (
-        f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
-        f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+        f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
         f"{tokenizer.__class__.__name__}.pkl"
     )
     return cache_dir / cache_key

@@ -1374,6 +1377,12 @@ if __name__ == "__main__":
         default=None,
         help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
     )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
     parser.add_argument(
         "--random-input-len",
         type=int,

@@ -1453,49 +1462,49 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-    )
-    parser.add_argument(
-        "--lora-name",
-        type=str,
-        default=None,
-        help="The name of LoRA adapter",
-    )

     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
-        "--gen-num-groups",
+        "--gsp-num-groups",
         type=int,
         default=64,
         help="Number of system prompt groups for generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--gen-prompts-per-group",
+        "--gsp-prompts-per-group",
         type=int,
         default=16,
         help="Number of prompts per system prompt group for generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--gen-system-prompt-len",
+        "--gsp-system-prompt-len",
         type=int,
         default=2048,
         help="Target length in tokens for system prompts in generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--gen-question-len",
+        "--gsp-question-len",
         type=int,
         default=128,
         help="Target length in tokens for questions in generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--gen-output-len",
+        "--gsp-output-len",
         type=int,
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )

     args = parser.parse_args()
     run_benchmark(args)
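The new --sharegpt-context-len flag threads through get_dataset into sample_sharegpt_requests, which now prunes any sample whose prompt plus output would exceed the given context length. A minimal sketch of a run that uses it, assuming bench_serving's existing --backend, --dataset-name, --num-prompts, --host, and --port flags; the address is a placeholder:

# Hypothetical bench_serving run that drops ShareGPT samples longer than
# a 4096-token context window.
import subprocess

subprocess.run(
    [
        "python", "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "sharegpt",
        "--num-prompts", "500",
        "--sharegpt-context-len", "4096",
        "--host", "127.0.0.1",  # placeholder address
        "--port", "30000",
    ],
    check=True,
)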
python/sglang/srt/managers/io_struct.py

@@ -59,6 +59,9 @@ class GenerateReqInput:
     return_text_in_logprobs: bool = False
     # Whether to stream output.
     stream: bool = False
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
+
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
     # LoRA related

@@ -196,6 +199,7 @@ class GenerateReqInput:
                 top_logprobs_num=self.top_logprobs_num[i],
                 return_text_in_logprobs=self.return_text_in_logprobs,
                 stream=self.stream,
+                log_metrics=self.log_metrics,
                 modalities=self.modalities[i] if self.modalities else None,
                 lora_path=self.lora_path[i] if self.lora_path is not None else None,
             )

@@ -243,6 +247,8 @@ class EmbeddingReqInput:
     sampling_params: Union[List[Dict], Dict] = None
     # Dummy input embeds for compatibility
     input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True

     def normalize_batch_and_arguments(self):
         if (self.text is None and self.input_ids is None) or (
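log_metrics defaults to True, so existing callers keep their current behavior; internal probes can now opt out per request. A minimal sketch of constructing such a request, assuming an installed sglang where this import path resolves:

# Hypothetical request that skips metrics collection and request dumping,
# mirroring how health_generate uses the new field (see server.py below).
from sglang.srt.managers.io_struct import GenerateReqInput

gri = GenerateReqInput(
    input_ids=[0],
    sampling_params={"max_new_tokens": 1, "temperature": 0.7},
    log_metrics=False,  # new field; defaults to True
)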
python/sglang/srt/managers/scheduler.py

@@ -631,7 +631,8 @@ class Scheduler:
         if len(req.origin_input_ids) > self.max_req_input_len:
             logger.warning(
                 "Request length is longer than the KV cache pool size or "
-                "the max context length. Truncated!!!"
+                "the max context length. Truncated. "
+                f"{len(req.origin_input_ids)=}, {self.max_req_input_len=}."
             )
             req.origin_input_ids = req.origin_input_ids[: self.max_req_input_len]
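The rewritten warning drops the "Truncated!!!" shouting and appends the actual lengths using Python 3.8's self-documenting f-string form, f"{expr=}", which renders the expression text together with its value. A standalone illustration:

# The {expr=} form expands to "expr=<value>", so the new warning reports
# both the request length and the limit without repeating variable names.
max_req_input_len = 2048
origin_input_ids = list(range(4096))

print(f"{len(origin_input_ids)=}, {max_req_input_len=}.")
# -> len(origin_input_ids)=4096, max_req_input_len=2048.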
python/sglang/srt/managers/tokenizer_manager.py

@@ -79,6 +79,7 @@ from sglang.srt.utils import (
     get_zmq_socket,
     kill_process_tree,
 )
+from sglang.utils import get_exception_traceback

 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

@@ -640,7 +641,9 @@ class TokenizerManager:
         self.to_create_loop = False
         loop = asyncio.get_event_loop()
-        self.asyncio_tasks.add(loop.create_task(self.handle_loop()))
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.handle_loop))
+        )

         # We cannot add signal handler when the tokenizer manager is not in
         # the main thread due to the CPython limitation.

@@ -653,7 +656,9 @@ class TokenizerManager:
                 "not in the main thread. This disables graceful shutdown of the "
                 "tokenizer manager when SIGTERM is received."
             )
-        self.asyncio_tasks.add(loop.create_task(self.sigterm_watchdog()))
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.sigterm_watchdog))
+        )

     async def sigterm_watchdog(self):
         while not self.gracefully_exit:

@@ -738,9 +743,13 @@ class TokenizerManager:
                     state.finished = recv_obj.finished_reasons[i] is not None
                     state.event.set()

-                    if self.enable_metrics:
+                    if self.enable_metrics and state.obj.log_metrics:
                         self.collect_metrics(state, recv_obj, i)
-                    if self.dump_requests_folder and state.finished:
+                    if (
+                        self.dump_requests_folder
+                        and state.finished
+                        and state.obj.log_metrics
+                    ):
                         self.dump_requests(state, out_dict)
             elif isinstance(recv_obj, OpenSessionReqOutput):
                 self.session_futures[recv_obj.session_id].set_result(

@@ -887,20 +896,38 @@ class TokenizerManager:
         )
         if len(self.dump_request_list) >= self.dump_requests_threshold:
-            filename = os.path.join(
-                self.dump_requests_folder,
-                datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".pkl",
-            )
-            logger.info(f"Dump {len(self.dump_request_list)} requests to {filename}")
-
             to_dump = self.dump_request_list
             self.dump_request_list = []

             def background_task():
                 os.makedirs(self.dump_requests_folder, exist_ok=True)
-                with open(filename, "wb") as f:
+                current_time = datetime.now()
+                filename = current_time.strftime("%Y-%m-%d_%H-%M-%S") + ".pkl"
+                with open(
+                    os.path.join(self.dump_requests_folder, filename), "wb"
+                ) as f:
                     pickle.dump(to_dump, f)

             # Schedule the task to run in the background without awaiting it
             asyncio.create_task(asyncio.to_thread(background_task))


+async def print_exception_wrapper(func):
+    """
+    Sometimes an asyncio function does not print exception.
+    We do another wrapper to handle the exception.
+    """
+    try:
+        await func()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"TokenizerManager hit an exception: {traceback}")
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(1)
+
+
 class SignalHandler:
     def __init__(self, tokenizer_manager):
         self.tokenizer_manager = tokenizer_manager
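An exception inside a fire-and-forget asyncio task is normally reported only when the task object is garbage-collected, so handle_loop or sigterm_watchdog could die silently. Wrapping them in print_exception_wrapper logs the traceback immediately and tears the process down. A self-contained sketch of the same pattern; the worker and logging setup are illustrative, not from the commit, and the sketch only logs where the real code also kills the process tree and exits:

import asyncio
import logging
import traceback

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

async def print_exception_wrapper(func):
    # Await the coroutine function and surface any failure explicitly,
    # instead of relying on asyncio's deferred "Task exception was never
    # retrieved" message.
    try:
        await func()
    except Exception:
        logger.error("Task hit an exception: %s", traceback.format_exc())

async def worker():
    raise RuntimeError("boom")

async def main():
    asyncio.create_task(print_exception_wrapper(worker))
    await asyncio.sleep(0.1)  # give the task time to fail and log

asyncio.run(main())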
python/sglang/srt/server.py

@@ -135,9 +135,13 @@ async def health_generate(request: Request) -> Response:
     sampling_params = {"max_new_tokens": 1, "temperature": 0.7}

     if tokenizer_manager.is_generation:
-        gri = GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
+        gri = GenerateReqInput(
+            input_ids=[0], sampling_params=sampling_params, log_metrics=False
+        )
     else:
-        gri = EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)
+        gri = EmbeddingReqInput(
+            input_ids=[0], sampling_params=sampling_params, log_metrics=False
+        )

     try:
         async for _ in tokenizer_manager.generate_request(gri, request):
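With log_metrics=False on the probe request, health checks no longer inflate the serving metrics or end up in the request dump. A minimal sketch of exercising the endpoint, assuming a server listening on a placeholder address:

# Hypothetical health probe against a running sglang server.
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:30000/health_generate") as resp:
    print(resp.status)  # 200 when the server can generate a token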
python/sglang/test/test_utils.py

@@ -560,6 +560,7 @@ def run_bench_serving(
         tokenizer=tokenizer,
         num_prompts=num_prompts,
         sharegpt_output_len=None,
+        sharegpt_context_len=None,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
         random_range_ratio=0.0,
test/srt/test_moe_ep.py

@@ -44,7 +44,7 @@ class TestEpMoE(unittest.TestCase):
         )
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        self.assertGreater(metrics["score"], 0.5)

     def test_mgsm_en(self):
         args = SimpleNamespace(

@@ -56,7 +56,7 @@ class TestEpMoE(unittest.TestCase):
         )
         metrics = run_eval(args)
-        assert metrics["score"] >= 0.8
+        self.assertGreater(metrics["score"], 0.8)


 class TestEpMoEFP8(unittest.TestCase):
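self.assertGreater reports both operands on failure, where the bare assert raised an AssertionError with no detail; note the comparison also tightens from >= to a strict >. A standalone illustration:

import unittest

class ScoreTest(unittest.TestCase):
    def test_score(self):
        score = 0.4
        # Fails with "AssertionError: 0.4 not greater than 0.5",
        # whereas `assert score >= 0.5` would fail with no message.
        self.assertGreater(score, 0.5)

if __name__ == "__main__":
    unittest.main()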