sglang · Commits · d198791f
"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "e7534542a2e736ab54328a7fb3a0a15fe4f31da2"
Unverified commit d198791f, authored Jul 22, 2024 by zhyncs; committed by GitHub on Jul 22, 2024.
misc: update output token logic (#695)
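In short (as inferred from the diff below): instead of re-tokenizing every generated response to count output tokens, the benchmark now records the requested output length on each RequestFuncOutput and passes the backend name into calculate_metrics, falling back to tokenizer-based counting only for the TensorRT LLM backend, which lacks `ignore_eos` support.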
parent c07526e4
Showing 1 changed file with 15 additions and 7 deletions (+15, -7).
python/sglang/bench_serving.py (view file @ d198791f)
@@ -54,6 +54,7 @@ class RequestFuncOutput:
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
     error: str = ""
+    output_len: int = 0


 def remove_prefix(text: str, prefix: str) -> str:
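For context, here is a minimal sketch of the RequestFuncOutput fields touched by this hunk; the real dataclass in bench_serving.py has additional fields that are elided here.

from dataclasses import dataclass, field
from typing import List


@dataclass
class RequestFuncOutput:
    # Only the fields visible in the hunk above; the real class has more.
    itl: List[float] = field(default_factory=list)  # inter-token latencies
    prompt_len: int = 0
    error: str = ""
    output_len: int = 0  # new in this commit: the recorded output length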
@@ -189,6 +190,7 @@ async def async_request_openai_completions(
                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
+                    output.output_len = request_func_input.output_len
                 else:
                     output.error = response.reason or ""
                     output.success = False
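Presumably the recorded value is trustworthy because the benchmark sends requests with `ignore_eos`, so a compliant backend generates exactly the requested number of completion tokens; storing request_func_input.output_len on success therefore avoids a detokenize-and-recount round trip.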
@@ -451,6 +453,7 @@ def calculate_metrics(
     outputs: List[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
+    backend: str,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     actual_output_lens: List[int] = []
     total_input = 0
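Callers must now supply the extra argument; the benchmark() hunk at the end of this diff threads backend through accordingly.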
@@ -460,13 +463,16 @@ def calculate_metrics(
     ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            output_len = len(
-                tokenizer(
-                    outputs[i].generated_text, add_special_tokens=False).input_ids
-            )
+            # We use the tokenizer solely to count output tokens for the TensorRT LLM backend,
+            # as it lacks `ignore_eos` support.
+            if backend == "trt":
+                # Note: this may inflate the output token count slightly
+                output_len = len(
+                    tokenizer(
+                        outputs[i].generated_text, add_special_tokens=False
+                    ).input_ids
+                )
+            else:
+                output_len = outputs[i].output_len
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
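As a standalone illustration of the "trt" branch above, this is how re-tokenizing generated text with a Hugging Face tokenizer counts output tokens (the model name is an arbitrary choice for the sketch, not something this commit specifies):

from transformers import AutoTokenizer

# Any tokenizer works for the illustration; "gpt2" is an arbitrary choice.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

generated_text = "The quick brown fox jumps over the lazy dog."
# add_special_tokens=False keeps BOS/EOS markers out of the count.
output_len = len(tokenizer(generated_text, add_special_tokens=False).input_ids)
print(output_len)  # number of tokens in the re-encoded text

The "may inflate the output token count slightly" note likely refers to detokenize-retokenize round trips not being length-preserving: the token sequence produced by re-encoding the decoded text can differ from the sequence the server actually generated.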
@@ -571,9 +577,11 @@ async def benchmark(
         outputs=outputs,
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
+        backend=backend,
     )

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
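For reference, the print format used here left-aligns the label in a 40-character column and the value in a 10-character column, with the header centered to width 50 using "=" fill; a quick sketch with made-up values:

# Renders the header plus the new "Backend:" row added by this commit.
print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Backend:", "sglang"))  # value is illustrative
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", 12.35))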