Commit bdac8f06 authored by zhuwenwen's avatar zhuwenwen
Browse files

update benchmraks and tests

parent ffbef65c
......@@ -62,6 +62,7 @@ def sample_requests(
def run_vllm(
warmup_requests: List[Tuple[str, int, int]],
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
......@@ -122,21 +123,37 @@ def run_vllm(
))
# warmup
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.num_prompts,
args.input_len))
dummy_inputs: List[PromptStrictInputs] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion():
llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
warmup_prompts = []
warmup_sampling_params = []
for prompt, _, output_len in warmup_requests:
warmup_prompts.append(prompt)
warmup_sampling_params.append(
SamplingParams(
n=n,
temperature=0.0 if use_beam_search else 1.0,
top_p=1.0,
use_beam_search=use_beam_search,
ignore_eos=True,
max_tokens=output_len,
))
print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion()
llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
# dummy_prompt_token_ids = np.random.randint(10000,
# size=(args.num_prompts,
# args.input_len))
# dummy_inputs: List[PromptStrictInputs] = [{
# "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()]
# def run_to_completion():
# llm.generate(dummy_inputs,
# sampling_params=sampling_params,
# use_tqdm=False)
# print("Warming up...")
# for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
# run_to_completion()
start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
......@@ -231,6 +248,10 @@ def main(args: argparse.Namespace):
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
# Synthesize a prompt with the given input length.
warmup_prompt = "hi" * 10
warmup_requests = [(warmup_prompt, 10, 10)
for _ in range(1)]
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
......@@ -240,7 +261,7 @@ def main(args: argparse.Namespace):
if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.model, args.tokenizer, args.quantization,
warmup_requests, requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
......@@ -314,10 +335,10 @@ if __name__ == "__main__":
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument('--num-iters-warmup',
type=int,
default=1,
help='Number of iterations to run for warmup.')
# parser.add_argument('--num-iters-warmup',
# type=int,
# default=1,
# help='Number of iterations to run for warmup.')
parser.add_argument("--num-prompts",
type=int,
default=1000,
......
......@@ -67,7 +67,8 @@ def test_chunked_prefill_recompute(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
# @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption(
caplog_vllm,
......@@ -118,7 +119,8 @@ def test_preemption(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
# @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
def test_swap(
......@@ -176,7 +178,8 @@ def test_swap(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
# @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
def test_swap_infeasible(
......@@ -220,7 +223,8 @@ def test_swap_infeasible(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
# @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption_infeasible(
vllm_runner,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment