[CI] Disable flash_attn backend for spec decode (#5286)

3a6ae1d3 · Simon Mo · GitHub · 8f1729b8 · 3a6ae1d3
Unverified · Commit 3a6ae1d3 authored Jun 05, 2024 by Simon Mo · Committed by GitHub Jun 05, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

.buildkite/test-pipeline.yaml +5 -2

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -45,7 +45,7 @@ steps:
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py 
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py

 - label: Distributed Tests (Multiple Groups)
  #mirror_hardwares: [amd]
@@ -124,7 +124,10 @@ steps:

 - label: Speculative decoding tests
  #mirror_hardwares: [amd]
-  command: pytest -v -s spec_decode
+  commands:
+    # See https://github.com/vllm-project/vllm/issues/5152
+    - export VLLM_ATTENTION_BACKEND=XFORMERS
+    - pytest -v -s spec_decode

 - label: LoRA Test %N
  #mirror_hardwares: [amd]