Commit bdac8f06 authored by zhuwenwen
Browse files

update benchmarks and tests

parent ffbef65c
...@@ -62,6 +62,7 @@ def sample_requests( ...@@ -62,6 +62,7 @@ def sample_requests(
def run_vllm( def run_vllm(
warmup_requests: List[Tuple[str, int, int]],
requests: List[Tuple[str, int, int]], requests: List[Tuple[str, int, int]],
model: str, model: str,
tokenizer: str, tokenizer: str,
...@@ -122,21 +123,37 @@ def run_vllm( ...@@ -122,21 +123,37 @@ def run_vllm(
)) ))
# warmup # warmup
dummy_prompt_token_ids = np.random.randint(10000, warmup_prompts = []
size=(args.num_prompts, warmup_sampling_params = []
args.input_len)) for prompt, _, output_len in warmup_requests:
dummy_inputs: List[PromptStrictInputs] = [{ warmup_prompts.append(prompt)
"prompt_token_ids": batch warmup_sampling_params.append(
} for batch in dummy_prompt_token_ids.tolist()] SamplingParams(
n=n,
def run_to_completion(): temperature=0.0 if use_beam_search else 1.0,
llm.generate(dummy_inputs, top_p=1.0,
sampling_params=sampling_params, use_beam_search=use_beam_search,
use_tqdm=False) ignore_eos=True,
max_tokens=output_len,
))
print("Warming up...") print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
run_to_completion()
# dummy_prompt_token_ids = np.random.randint(10000,
# size=(args.num_prompts,
# args.input_len))
# dummy_inputs: List[PromptStrictInputs] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion():
# llm.generate(dummy_inputs,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion()
start = time.perf_counter() start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True) llm.generate(prompts, sampling_params, use_tqdm=True)
...@@ -231,6 +248,10 @@ def main(args: argparse.Namespace): ...@@ -231,6 +248,10 @@ def main(args: argparse.Namespace):
args.tokenizer, trust_remote_code=args.trust_remote_code) args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None: if args.dataset is None:
# Synthesize a prompt with the given input length. # Synthesize a prompt with the given input length.
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
prompt = "hi" * (args.input_len - 1) prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len) requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)] for _ in range(args.num_prompts)]
...@@ -240,7 +261,7 @@ def main(args: argparse.Namespace): ...@@ -240,7 +261,7 @@ def main(args: argparse.Namespace):
if args.backend == "vllm": if args.backend == "vllm":
elapsed_time = run_vllm( elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization, warmup_requests, requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len, args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype, args.enforce_eager, args.kv_cache_dtype,
...@@ -314,10 +335,10 @@ if __name__ == "__main__": ...@@ -314,10 +335,10 @@ if __name__ == "__main__":
default=1, default=1,
help="Number of generated sequences per prompt.") help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument('--num-iters-warmup', # parser.add_argument('--num-iters-warmup',
type=int, # type=int,
default=1, # default=1,
help='Number of iterations to run for warmup.') # help='Number of iterations to run for warmup.')
parser.add_argument("--num-prompts", parser.add_argument("--num-prompts",
type=int, type=int,
default=1000, default=1000,
......
...@@ -67,7 +67,8 @@ def test_chunked_prefill_recompute( ...@@ -67,7 +67,8 @@ def test_chunked_prefill_recompute(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) # @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96]) @pytest.mark.parametrize("max_tokens", [96])
def test_preemption( def test_preemption(
caplog_vllm, caplog_vllm,
...@@ -118,7 +119,8 @@ def test_preemption( ...@@ -118,7 +119,8 @@ def test_preemption(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) # @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96]) @pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4]) @pytest.mark.parametrize("beam_width", [4])
def test_swap( def test_swap(
...@@ -176,7 +178,8 @@ def test_swap( ...@@ -176,7 +178,8 @@ def test_swap(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) # @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96]) @pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4]) @pytest.mark.parametrize("beam_width", [4])
def test_swap_infeasible( def test_swap_infeasible(
...@@ -220,7 +223,8 @@ def test_swap_infeasible( ...@@ -220,7 +223,8 @@ def test_swap_infeasible(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) # @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96]) @pytest.mark.parametrize("max_tokens", [96])
def test_preemption_infeasible( def test_preemption_infeasible(
vllm_runner, vllm_runner,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment