Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
norm
vllm
Commits
2709c000
Unverified
Commit
2709c000
authored
Jan 19, 2024
by
Harry Mellor
Committed by
GitHub
Jan 18, 2024
Browse files
Support OpenAI API server in `benchmark_serving.py` (#2172)
parent
dd7e8f5f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
51 additions
and
32 deletions
+51
-32
.gitignore
.gitignore
+3
-0
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+48
-32
No files found.
.gitignore
View file @
2709c000
...
...
@@ -181,3 +181,6 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
# Benchmark dataset
*.json
benchmarks/benchmark_serving.py
View file @
2709c000
...
...
@@ -24,6 +24,7 @@ from typing import AsyncGenerator, List, Tuple
import
aiohttp
import
numpy
as
np
from
tqdm.asyncio
import
tqdm
from
transformers
import
PreTrainedTokenizerBase
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
...
@@ -40,15 +41,10 @@ def sample_requests(
with
open
(
dataset_path
)
as
f
:
dataset
=
json
.
load
(
f
)
# Filter out the conversations with less than 2 turns.
dataset
=
[
data
for
data
in
dataset
if
len
(
data
[
"conversations"
])
>=
2
]
dataset
=
[
data
for
data
in
dataset
if
len
(
data
[
"conversations"
])
>=
2
]
# Only keep the first two turns of each conversation.
dataset
=
[
(
data
[
"conversations"
][
0
][
"value"
],
data
[
"conversations"
][
1
][
"value"
])
for
data
in
dataset
]
dataset
=
[(
data
[
"conversations"
][
0
][
"value"
],
data
[
"conversations"
][
1
][
"value"
])
for
data
in
dataset
]
# Tokenize the prompts and completions.
prompts
=
[
prompt
for
prompt
,
_
in
dataset
]
...
...
@@ -98,6 +94,7 @@ async def get_request(
async
def
send_request
(
backend
:
str
,
model
:
str
,
api_url
:
str
,
prompt
:
str
,
prompt_len
:
int
,
...
...
@@ -120,6 +117,8 @@ async def send_request(
"ignore_eos"
:
True
,
"stream"
:
False
,
}
if
model
is
not
None
:
pload
[
"model"
]
=
model
elif
backend
==
"tgi"
:
assert
not
use_beam_search
params
=
{
...
...
@@ -137,7 +136,8 @@ async def send_request(
timeout
=
aiohttp
.
ClientTimeout
(
total
=
3
*
3600
)
async
with
aiohttp
.
ClientSession
(
timeout
=
timeout
)
as
session
:
while
True
:
async
with
session
.
post
(
api_url
,
headers
=
headers
,
json
=
pload
)
as
response
:
async
with
session
.
post
(
api_url
,
headers
=
headers
,
json
=
pload
)
as
response
:
chunks
=
[]
async
for
chunk
,
_
in
response
.
content
.
iter_chunks
():
chunks
.
append
(
chunk
)
...
...
@@ -155,6 +155,7 @@ async def send_request(
async
def
benchmark
(
backend
:
str
,
model
:
str
,
api_url
:
str
,
input_requests
:
List
[
Tuple
[
str
,
int
,
int
]],
best_of
:
int
,
...
...
@@ -164,11 +165,11 @@ async def benchmark(
tasks
:
List
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
):
prompt
,
prompt_len
,
output_len
=
request
task
=
asyncio
.
create_task
(
send_request
(
backend
,
api_url
,
prompt
,
prompt_len
,
outpu
t_len
,
best_of
,
use_beam_search
))
task
=
asyncio
.
create_task
(
send_request
(
backend
,
model
,
api_url
,
prompt
,
promp
t_len
,
output_len
,
best_of
,
use_beam_search
))
tasks
.
append
(
task
)
await
asyncio
.
gather
(
*
tasks
)
await
tqdm
.
gather
(
*
tasks
)
def
main
(
args
:
argparse
.
Namespace
):
...
...
@@ -176,13 +177,15 @@ def main(args: argparse.Namespace):
random
.
seed
(
args
.
seed
)
np
.
random
.
seed
(
args
.
seed
)
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/generate"
tokenizer
=
get_tokenizer
(
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}{
args
.
endpoint
}
"
tokenizer
=
get_tokenizer
(
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
input_requests
=
sample_requests
(
args
.
dataset
,
args
.
num_prompts
,
tokenizer
)
benchmark_start_time
=
time
.
perf_counter
()
asyncio
.
run
(
benchmark
(
args
.
backend
,
api_url
,
input_requests
,
args
.
best_of
,
args
.
use_beam_search
,
args
.
request_rate
))
asyncio
.
run
(
benchmark
(
args
.
backend
,
args
.
model
,
api_url
,
input_requests
,
args
.
best_of
,
args
.
use_beam_search
,
args
.
request_rate
))
benchmark_end_time
=
time
.
perf_counter
()
benchmark_time
=
benchmark_end_time
-
benchmark_start_time
print
(
f
"Total time:
{
benchmark_time
:.
2
f
}
s"
)
...
...
@@ -196,10 +199,8 @@ def main(args: argparse.Namespace):
for
prompt_len
,
output_len
,
latency
in
REQUEST_LATENCY
])
print
(
f
"Average latency per token:
{
avg_per_token_latency
:.
2
f
}
s"
)
avg_per_output_token_latency
=
np
.
mean
([
latency
/
output_len
for
_
,
output_len
,
latency
in
REQUEST_LATENCY
])
avg_per_output_token_latency
=
np
.
mean
(
[
latency
/
output_len
for
_
,
output_len
,
latency
in
REQUEST_LATENCY
])
print
(
"Average latency per output token: "
f
"
{
avg_per_output_token_latency
:.
2
f
}
s"
)
...
...
@@ -207,27 +208,42 @@ def main(args: argparse.Namespace):
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Benchmark the online serving throughput."
)
parser
.
add_argument
(
"--backend"
,
type
=
str
,
default
=
"vllm"
,
parser
.
add_argument
(
"--backend"
,
type
=
str
,
default
=
"vllm"
,
choices
=
[
"vllm"
,
"tgi"
])
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
required
=
True
,
parser
.
add_argument
(
"--endpoint"
,
type
=
str
,
default
=
"/generate"
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
required
=
True
,
help
=
"Path to the dataset."
)
parser
.
add_argument
(
"--tokenizer"
,
type
=
str
,
required
=
True
,
parser
.
add_argument
(
"--tokenizer"
,
type
=
str
,
required
=
True
,
help
=
"Name or path of the tokenizer."
)
parser
.
add_argument
(
"--best-of"
,
type
=
int
,
default
=
1
,
parser
.
add_argument
(
"--best-of"
,
type
=
int
,
default
=
1
,
help
=
"Generates `best_of` sequences per prompt and "
"returns the best one."
)
"returns the best one."
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--num-prompts"
,
type
=
int
,
default
=
1000
,
parser
.
add_argument
(
"--num-prompts"
,
type
=
int
,
default
=
1000
,
help
=
"Number of prompts to process."
)
parser
.
add_argument
(
"--request-rate"
,
type
=
float
,
default
=
float
(
"inf"
),
parser
.
add_argument
(
"--request-rate"
,
type
=
float
,
default
=
float
(
"inf"
),
help
=
"Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize "
"the request arrival times."
)
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize "
"the request arrival times."
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
0
)
parser
.
add_argument
(
'--trust-remote-code'
,
action
=
'store_true'
,
parser
.
add_argument
(
'--trust-remote-code'
,
action
=
'store_true'
,
help
=
'trust remote code from huggingface'
)
args
=
parser
.
parse_args
()
main
(
args
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment