Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d6ab5289
Unverified
Commit
d6ab5289
authored
Jul 11, 2024
by
Lily Liu
Committed by
GitHub
Jul 12, 2024
Browse files
[Misc] Remove flashinfer warning, add flashinfer tests to CI (#6351)
parent
7ed6a4f0
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 10 additions and 6 deletions
+10
-6
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+5
-3
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+5
-0
vllm/attention/selector.py
vllm/attention/selector.py
+0
-3
No files found.
.buildkite/test-pipeline.yaml
View file @
d6ab5289
...
...
@@ -19,8 +19,10 @@ steps:
- label: Basic Correctness Test
  mirror_hardwares: [amd]
  commands:
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASHINFER pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
...
...
@@ -122,14 +124,14 @@ steps:
- label: Kernels Test %N
  #mirror_hardwares: [amd]
  commands:
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4
- label: Models Test
  #mirror_hardwares: [amd]
  commands:
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pytest -v -s models -m \"not vlm\"
- label: Vision Language Models Test
...
...
@@ -240,7 +242,7 @@ steps:
  - pytest -v -s distributed/test_custom_all_reduce.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s -x lora/test_mixtral.py
tests/basic_correctness/test_basic_correctness.py
View file @
d6ab5289
...
...
@@ -2,11 +2,13 @@
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
import os
import weakref
import pytest
from vllm import LLM
from vllm.utils import is_hip
from ..models.utils import check_outputs_equal
...
...
@@ -26,6 +28,9 @@ def test_vllm_gc_ed():
    assert weak_llm() is None
@pytest.mark.skipif(is_hip()
                    and os.getenv("VLLM_ATTENTION_BACKEND") == "FLASHINFER",
                    reason="Flashinfer does not support ROCm/HIP.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
...
...
vllm/attention/selector.py
View file @
d6ab5289
...
...
@@ -77,9 +77,6 @@ def get_attn_backend(
        return IpexAttnBackend
    elif backend == _Backend.FLASHINFER:
        logger.info("Using Flashinfer backend.")
        logger.warning(("Flashinfer will be stuck on llama-2-7b,"
                        " please avoid using Flashinfer as the "
                        "backend when running on llama-2-7b."))
        from vllm.attention.backends.flashinfer import FlashInferBackend
        return FlashInferBackend
    elif backend == _Backend.PALLAS:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment