Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bdac8f06
Commit
bdac8f06
authored
Aug 03, 2024
by
zhuwenwen
Browse files
update benchmarks and tests
parent
ffbef65c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
48 additions
and
23 deletions
+48
-23
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+40
-19
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+8
-4
No files found.
benchmarks/benchmark_throughput.py
View file @
bdac8f06
...
@@ -62,6 +62,7 @@ def sample_requests(
...
@@ -62,6 +62,7 @@ def sample_requests(
def
run_vllm
(
def
run_vllm
(
warmup_requests
:
List
[
Tuple
[
str
,
int
,
int
]],
requests
:
List
[
Tuple
[
str
,
int
,
int
]],
requests
:
List
[
Tuple
[
str
,
int
,
int
]],
model
:
str
,
model
:
str
,
tokenizer
:
str
,
tokenizer
:
str
,
...
@@ -122,21 +123,37 @@ def run_vllm(
...
@@ -122,21 +123,37 @@ def run_vllm(
))
))
# warmup
# warmup
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
warmup_prompts
=
[]
size
=
(
args
.
num_prompts
,
warmup_sampling_params
=
[]
args
.
input_len
))
for
prompt
,
_
,
output_len
in
warmup_requests
:
dummy_inputs
:
List
[
PromptStrictInputs
]
=
[{
warmup_prompts
.
append
(
prompt
)
"prompt_token_ids"
:
batch
warmup_sampling_params
.
append
(
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
SamplingParams
(
n
=
n
,
def
run_to_completion
():
temperature
=
0.0
if
use_beam_search
else
1.0
,
llm
.
generate
(
dummy_inputs
,
top_p
=
1.0
,
sampling_params
=
sampling_params
,
use_beam_search
=
use_beam_search
,
use_tqdm
=
False
)
ignore_eos
=
True
,
max_tokens
=
output_len
,
))
print
(
"Warming up..."
)
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
llm
.
generate
(
warmup_prompts
,
warmup_sampling_params
,
use_tqdm
=
True
)
run_to_completion
()
# dummy_prompt_token_ids = np.random.randint(10000,
# size=(args.num_prompts,
# args.input_len))
# dummy_inputs: List[PromptStrictInputs] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion():
# llm.generate(dummy_inputs,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion()
start
=
time
.
perf_counter
()
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
llm
.
generate
(
prompts
,
sampling_params
,
use_tqdm
=
True
)
...
@@ -231,6 +248,10 @@ def main(args: argparse.Namespace):
...
@@ -231,6 +248,10 @@ def main(args: argparse.Namespace):
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
args
.
tokenizer
,
trust_remote_code
=
args
.
trust_remote_code
)
if
args
.
dataset
is
None
:
if
args
.
dataset
is
None
:
# Synthesize a prompt with the given input length.
# Synthesize a prompt with the given input length.
warmup_prompt
=
"hi"
*
10
warmup_requests
=
[(
warmup_prompt
,
10
,
10
)
for
_
in
range
(
1
)]
prompt
=
"hi"
*
(
args
.
input_len
-
1
)
prompt
=
"hi"
*
(
args
.
input_len
-
1
)
requests
=
[(
prompt
,
args
.
input_len
,
args
.
output_len
)
requests
=
[(
prompt
,
args
.
input_len
,
args
.
output_len
)
for
_
in
range
(
args
.
num_prompts
)]
for
_
in
range
(
args
.
num_prompts
)]
...
@@ -240,7 +261,7 @@ def main(args: argparse.Namespace):
...
@@ -240,7 +261,7 @@ def main(args: argparse.Namespace):
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
elapsed_time
=
run_vllm
(
elapsed_time
=
run_vllm
(
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
warmup_requests
,
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
...
@@ -314,10 +335,10 @@ if __name__ == "__main__":
...
@@ -314,10 +335,10 @@ if __name__ == "__main__":
default
=
1
,
default
=
1
,
help
=
"Number of generated sequences per prompt."
)
help
=
"Number of generated sequences per prompt."
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
'--num-iters-warmup'
,
#
parser.add_argument('--num-iters-warmup',
type
=
int
,
#
type=int,
default
=
1
,
#
default=1,
help
=
'Number of iterations to run for warmup.'
)
#
help='Number of iterations to run for warmup.')
parser
.
add_argument
(
"--num-prompts"
,
parser
.
add_argument
(
"--num-prompts"
,
type
=
int
,
type
=
int
,
default
=
1000
,
default
=
1000
,
...
...
tests/basic_correctness/test_preemption.py
View file @
bdac8f06
...
@@ -67,7 +67,8 @@ def test_chunked_prefill_recompute(
...
@@ -67,7 +67,8 @@ def test_chunked_prefill_recompute(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption
(
def
test_preemption
(
caplog_vllm
,
caplog_vllm
,
...
@@ -118,7 +119,8 @@ def test_preemption(
...
@@ -118,7 +119,8 @@ def test_preemption(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap
(
def
test_swap
(
...
@@ -176,7 +178,8 @@ def test_swap(
...
@@ -176,7 +178,8 @@ def test_swap(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
[
4
])
def
test_swap_infeasible
(
def
test_swap_infeasible
(
...
@@ -220,7 +223,8 @@ def test_swap_infeasible(
...
@@ -220,7 +223,8 @@ def test_swap_infeasible(
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
# @pytest.mark.parametrize("dtype", ["float"])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
def
test_preemption_infeasible
(
def
test_preemption_infeasible
(
vllm_runner
,
vllm_runner
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment