update benchmarks and tests

bdac8f06 · zhuwenwen · ffbef65c · bdac8f06 · bdac8f06
Commit bdac8f06 authored Aug 03, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 48 additions and 23 deletions

benchmarks/benchmark_throughput.py benchmarks/benchmark_throughput.py +40 -19

tests/basic_correctness/test_preemption.py tests/basic_correctness/test_preemption.py +8 -4

No files found.
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -62,6 +62,7 @@ def sample_requests(
 def run_vllm(
+    warmup_requests: List[Tuple[str, int, int]],
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
@@ -122,21 +123,37 @@ def run_vllm(
            ))
    # warmup
-    dummy_prompt_token_ids = np.random.randint(10000,
+    warmup_prompts = []
-                                               size=(args.num_prompts,
+    warmup_sampling_params = []
-                                                     args.input_len))
+    for prompt, _, output_len in warmup_requests:
-    dummy_inputs: List[PromptStrictInputs] = [{
+        warmup_prompts.append(prompt)
-        "prompt_token_ids": batch
+        warmup_sampling_params.append(
-    } for batch in dummy_prompt_token_ids.tolist()]
+            SamplingParams(
+                n=n,
-    def run_to_completion():
+                temperature=0.0 if use_beam_search else 1.0,
-        llm.generate(dummy_inputs,
+                top_p=1.0,
-                        sampling_params=sampling_params,
+                use_beam_search=use_beam_search,
-                        use_tqdm=False)
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+    llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
-        run_to_completion()
+    # dummy_prompt_token_ids = np.random.randint(10000,
+    #                                            size=(args.num_prompts,
+    #                                                  args.input_len))
+    # dummy_inputs: List[PromptStrictInputs] = [{
+    #     "prompt_token_ids": batch
+    # } for batch in dummy_prompt_token_ids.tolist()]
+    # def run_to_completion():
+    #     llm.generate(dummy_inputs,
+    #                     sampling_params=sampling_params,
+    #                     use_tqdm=False)
+    # print("Warming up...")
+    # for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+    #     run_to_completion()
    start = time.perf_counter()
    llm.generate(prompts, sampling_params, use_tqdm=True)
@@ -231,6 +248,10 @@ def main(args: argparse.Namespace):
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
+        warmup_prompt = "hi" * 10
+        warmup_requests = [(warmup_prompt, 10, 10)
+                    for _ in range(1)]
        prompt = "hi" * (args.input_len - 1)
        requests = [(prompt, args.input_len, args.output_len)
                    for _ in range(args.num_prompts)]
@@ -240,7 +261,7 @@ def main(args: argparse.Namespace):
    if args.backend == "vllm":
        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
+            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,
@@ -314,10 +335,10 @@ if __name__ == "__main__":
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument('--num-iters-warmup',
+    # parser.add_argument('--num-iters-warmup',
-                        type=int,
+    #                     type=int,
-                        default=1,
+    #                     default=1,
-                        help='Number of iterations to run for warmup.')
+    #                     help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -67,7 +67,8 @@ def test_chunked_prefill_recompute(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 def test_preemption(
    caplog_vllm,
@@ -118,7 +119,8 @@ def test_preemption(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 @pytest.mark.parametrize("beam_width", [4])
 def test_swap(
@@ -176,7 +178,8 @@ def test_swap(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 @pytest.mark.parametrize("beam_width", [4])
 def test_swap_infeasible(
@@ -220,7 +223,8 @@ def test_swap_infeasible(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 def test_preemption_infeasible(
    vllm_runner,