Fixing Chunked Prefill Test. (#19762)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>

Fixing Chunked Prefill Test. (#19762)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
47194606 · Alexei-V-Ivanov-AMD · GitHub · 466166dc · 47194606 · 47194606
Unverified commit 47194606, authored Jun 19, 2025 by Alexei-V-Ivanov-AMD; committed by GitHub on Jun 19, 2025.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 17 additions and 3 deletions

.buildkite/test-pipeline.yaml .buildkite/test-pipeline.yaml +1 -1

tests/basic_correctness/test_chunked_prefill.py tests/basic_correctness/test_chunked_prefill.py +16 -2

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -89,7 +89,7 @@ steps:
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -49,7 +49,13 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", [
+    pytest.param("FLASHINFER",
+                 marks=pytest.mark.skipif(
+                     current_platform.is_rocm(),
+                     reason="FLASHINFER isn't supported on ROCm")),
+    "FLASH_ATTN"
+])
 def test_models(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
@@ -99,7 +105,13 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", [
+    pytest.param("FLASHINFER",
+                 marks=pytest.mark.skipif(
+                     current_platform.is_rocm(),
+                     reason="FLASHINFER isn't supported on ROCm")),
+    "FLASH_ATTN"
+])
 def test_models_distributed(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
@@ -172,6 +184,8 @@ def test_models_distributed(
 # Due to low-precision numerical divergence, this test is too sensitive to
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="machete_prepack_B isn't supported on ROCm")
 def test_models_with_fp8_kv_cache(
    vllm_runner: VllmRunner,
    example_prompts,