zhaoyu6 / sglang · Commit 35759efa
"vscode:/vscode.git/clone" did not exist on "bbe41ce41a49099097f30fcdb59f08e707d166e1"
Unverified commit 35759efa, authored Jul 20, 2024 by Lianmin Zheng, committed by GitHub on Jul 20, 2024.
Support random dataset in bench_serving.py (#669)
Parent: 8f4b1559
Showing 4 changed files with 82 additions and 15 deletions (+82, -15):
- python/sglang/bench_serving.py (+76, -9)
- python/sglang/srt/managers/controller/model_runner.py (+1, -1)
- python/sglang/srt/managers/io_struct.py (+1, -1)
- python/sglang/srt/managers/tokenizer_manager.py (+4, -4)
python/sglang/bench_serving.py (view file @ 35759efa)
```diff
@@ -273,6 +273,37 @@ def sample_sharegpt_requests(
     return filtered_dataset
 
 
+def sample_random_requests(
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode(
+            [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])]
+        )
+        input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
+
+    print(f"#Input tokens: {np.sum(input_lens)}")
+    print(f"#Output tokens: {np.sum(output_lens)}")
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
```
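For illustration, here is a minimal standalone sketch of the prompt-synthesis scheme above, outside bench_serving.py. Each request's token ids are `(offsets[i] + i + j) % vocab_size`, a deterministic stream given the sampled offset; the tokenizer name and sizes below are illustrative assumptions, not from the commit. Note that decoding ids to text and re-tokenizing later will not necessarily round-trip to exactly `input_lens[i]` tokens.

```python
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative choice, not from the commit

num_prompts, input_len, range_ratio = 3, 16, 0.5
# Sample a length per request, uniform over [input_len * range_ratio, input_len].
input_lens = np.random.randint(int(input_len * range_ratio), input_len + 1, size=num_prompts)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)

for i in range(num_prompts):
    # Deterministic id stream per request, wrapped into the vocabulary.
    ids = [(offsets[i] + i + j) % tokenizer.vocab_size for j in range(input_lens[i])]
    print(repr(tokenizer.decode(ids)))
```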
```diff
@@ -530,13 +561,23 @@ def fire(args: argparse.Namespace):
     tokenizer = get_tokenizer(tokenizer_id)
 
-    assert args.dataset is not None
-    input_requests = sample_sharegpt_requests(
-        dataset_path=args.dataset,
-        num_requests=args.num_prompts,
-        tokenizer=tokenizer,
-        fixed_output_len=args.sharegpt_output_len,
-    )
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
     asyncio.run(
         benchmark(
```
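The new branch in fire() is a plain if/elif dispatch. As a design note (not part of the commit), the same selection can be written table-driven, so the set of valid dataset names and the error path live in one place. This sketch assumes it sits inside bench_serving.py, where both sampler functions are defined:

```python
import argparse
from functools import partial

def build_input_requests(args: argparse.Namespace, tokenizer):
    # Map dataset names to zero-argument sampler thunks; flag names match
    # the diff above, the wrapper itself is only a sketch.
    samplers = {
        "sharegpt": partial(
            sample_sharegpt_requests,
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        ),
        "random": partial(
            sample_random_requests,
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            num_prompts=args.num_prompts,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
        ),
    }
    try:
        return samplers[args.dataset_name]()
    except KeyError:
        raise ValueError(f"Unknown dataset: {args.dataset_name}") from None
```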
```diff
@@ -589,7 +630,14 @@ if __name__ == "__main__":
         help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
     )
     parser.add_argument(
-        "--dataset", type=str, default="sharegpt", help="Path to the ShareGPT dataset"
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "random"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default="", help="Path to the dataset."
     )
     parser.add_argument(
         "--model",
```
```diff
@@ -613,10 +661,29 @@ if __name__ == "__main__":
         default=None,
         help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
     )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random dataset.",
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
-        default=128.0,
+        default=float("inf"),
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
     )
```
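A note on the semantics of the new flags: np.random.randint samples from the half-open interval [low, high), so the default --random-range-ratio of 1.0 pins each length to exactly the requested value, while smaller ratios spread lengths uniformly over [int(L * ratio), L]. A quick check (illustrative values):

```python
import numpy as np

L = 1024
for ratio in (1.0, 0.5):
    lens = np.random.randint(int(L * ratio), L + 1, size=10_000)
    print(ratio, lens.min(), lens.max())  # 1.0 -> always 1024; 0.5 -> spans [512, 1024]
```

Separately, the --request-rate default changes here from 128.0 to float("inf"), while its help string still reads "Default is 128.0."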
python/sglang/srt/managers/controller/model_runner.py (view file @ 35759efa)
```diff
@@ -233,7 +233,7 @@ class ModelRunner:
             return
 
         logger.info(f"[gpu_id={self.gpu_id}] Capture cuda graph begin.")
-        batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 16)]
+        batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
            self, max_batch_size_to_capture=max(batch_size_list)
        )
```
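This one-line change adds one more CUDA graph capture bucket: range(1, 16) tops out at 15 * 8 = 120, while range(1, 17) reaches 128. A quick check:

```python
old = [1, 2, 4] + [i * 8 for i in range(1, 16)]
new = [1, 2, 4] + [i * 8 for i in range(1, 17)]
print(max(old), max(new))  # 120 128
```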
python/sglang/srt/managers/io_struct.py (view file @ 35759efa)
```diff
@@ -40,7 +40,7 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
-        if "n" in self.sampling_params and self.sampling_params["n"] != 1:
+        if self.sampling_params.get("n", 1) != 1:
             is_single = False
         else:
             if self.text is not None:
```
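This io_struct.py change is behavior-preserving: for any dict d, `"n" in d and d["n"] != 1` is equivalent to `d.get("n", 1) != 1`, because a missing key falls back to the default 1 and compares false either way. A quick check:

```python
for params in ({}, {"n": 1}, {"n": 4}):
    old = "n" in params and params["n"] != 1
    new = params.get("n", 1) != 1
    assert old == new, params
print("equivalent on all three cases")
```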
python/sglang/srt/managers/tokenizer_manager.py (view file @ 35759efa)
```diff
@@ -196,14 +196,14 @@ class TokenizerManager:
         event = asyncio.Event()
         state = ReqState([], False, event)
         self.rid_to_state[rid] = state
-        if is_prefill == False:
+        if is_prefill:
+            await self._wait_for_prefill_response(event, state, obj, request, rid)
+            yield input_ids
+        else:
             async for response in self._wait_for_response(
                 event, state, obj, rid, request
             ):
                 yield response
-        else:
-            await self._wait_for_prefill_response(event, state, obj, request, rid)
-            yield input_ids
 
     async def _handle_batch_request(self, obj, request):
         batch_size = obj.batch_size
```
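The tokenizer_manager.py edit is likewise a pure control-flow refactor: the branches swap so the positive condition comes first, and the `== False` comparison (discouraged by PEP 8) disappears. A minimal sketch of the equivalence, with the handler bodies stubbed out (the stub strings are illustrative, not from the commit):

```python
def old_style(is_prefill: bool) -> str:
    if is_prefill == False:  # noqa: E712 -- the pattern the commit removes
        return "stream decode responses"
    else:
        return "return prefill result"

def new_style(is_prefill: bool) -> str:
    if is_prefill:
        return "return prefill result"
    else:
        return "stream decode responses"

assert all(old_style(p) == new_style(p) for p in (True, False))
```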