Update kernels and basic_correctness tests: narrow the parametrized attention backends/devices and disable the fp8 KV-cache chunked-prefill test

c012f7f6 · zhuwenwen · 55c5f16f · c012f7f6 · c012f7f6 · c012f7f6
Commit c012f7f6 authored Nov 10, 2024 by zhuwenwen
4 changed files
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -36,7 +36,8 @@ def test_vllm_gc_ed():


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False, True])

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -121,71 +121,71 @@ def test_models_distributed(
    )


-@pytest.mark.parametrize(
-    "kv_cache_dtype,model",
-    [("fp8_e4m3",
-      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
-# Due to low-precision numerical divergence, we only test logprob of 4 tokens
-@pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
-@pytest.mark.parametrize("enforce_eager", [False, True])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive to
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
-def test_models_with_fp8_kv_cache(
-    vllm_runner,
-    example_prompts,
-    kv_cache_dtype: str,
-    model: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    enforce_eager: bool,
-    tensor_parallel_size: int,
-    disable_async_output_proc: bool,
-) -> None:
-    """
-    Check output logprobs match between no_chunked_prefill and chunked_prefill
-    with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
-    so here we only check chunked prefill.
-    """
-    NUM_LOG_PROBS = 8
-
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    with vllm_runner(
-            model,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
-            kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
-        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
-
-    with vllm_runner(
-            model,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=True,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
-            kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
-    ) as vllm_model:
-        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, NUM_LOG_PROBS)
-
-    check_logprobs_close(
-        outputs_0_lst=no_chunked_prefill_outputs,
-        outputs_1_lst=chunked_prefill_outputs,
-        name_0="no_chunked_prefill",
-        name_1="chunked_prefill",
-    )
+# @pytest.mark.parametrize(
+#     "kv_cache_dtype,model",
+#     [("fp8_e4m3",
+#       "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
+# # Due to low-precision numerical divergence, we only test logprob of 4 tokens
+# @pytest.mark.parametrize("max_tokens", [4])
+# @pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
+# @pytest.mark.parametrize("enforce_eager", [False, True])
+# # NOTE: Increasing this in this suite will fail CI because we currently cannot
+# # reset distributed env properly. Use a value > 1 just when you test.
+# @pytest.mark.parametrize("tensor_parallel_size", [1])
+# # Due to low-precision numerical divergence, this test is too sensitive to
+# # the async postprocessor
+# @pytest.mark.parametrize("disable_async_output_proc", [True])
+# def test_models_with_fp8_kv_cache(
+#     vllm_runner,
+#     example_prompts,
+#     kv_cache_dtype: str,
+#     model: str,
+#     max_tokens: int,
+#     chunked_prefill_token_size: int,
+#     enforce_eager: bool,
+#     tensor_parallel_size: int,
+#     disable_async_output_proc: bool,
+# ) -> None:
+#     """
+#     Check output logprobs match between no_chunked_prefill and chunked_prefill
+#     with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
+#     so here we only check chunked prefill.
+#     """
+#     NUM_LOG_PROBS = 8
+
+#     max_num_seqs = chunked_prefill_token_size
+#     max_num_batched_tokens = chunked_prefill_token_size
+
+#     with vllm_runner(
+#             model,
+#             tensor_parallel_size=tensor_parallel_size,
+#             enforce_eager=enforce_eager,
+#             max_num_seqs=max_num_seqs,
+#             kv_cache_dtype=kv_cache_dtype,
+#             disable_async_output_proc=disable_async_output_proc,
+#     ) as vllm_model:
+#         no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
+#             example_prompts, max_tokens, NUM_LOG_PROBS)
+
+#     with vllm_runner(
+#             model,
+#             max_num_batched_tokens=max_num_batched_tokens,
+#             enable_chunked_prefill=True,
+#             tensor_parallel_size=tensor_parallel_size,
+#             enforce_eager=enforce_eager,
+#             max_num_seqs=max_num_seqs,
+#             kv_cache_dtype=kv_cache_dtype,
+#             disable_async_output_proc=disable_async_output_proc,
+#     ) as vllm_model:
+#         chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
+#             example_prompts, max_tokens, NUM_LOG_PROBS)
+
+#     check_logprobs_close(
+#         outputs_0_lst=no_chunked_prefill_outputs,
+#         outputs_1_lst=chunked_prefill_outputs,
+#         name_0="no_chunked_prefill",
+#         name_1="chunked_prefill",
+#     )


 @pytest.mark.parametrize("max_tokens", [16])

--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -115,7 +115,8 @@ def ref_single_query_cached_kv_attention(


 @pytest.mark.parametrize(
-    "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"])
+    # "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"])
+    "version", ["v1", "v2"])
 @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)

--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -8,9 +8,12 @@ from vllm.attention.selector import which_attn_to_use
 from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL


+# @pytest.mark.parametrize(
+#     "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
+# @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
 @pytest.mark.parametrize(
-    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
-@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
+    "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
 def test_env(name: str, device: str, monkeypatch):
    """Test that the attention selector can be set via environment variable.
    Note that we do not test FlashAttn because it is the default backend.