Unverified commit 61728cd1, authored by Copilot and committed by GitHub
Browse files

Re-enable FlashInfer for Llama4 on Blackwell in e2e fusion tests (#28966)


Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
parent 0c80efd9
...@@ -930,6 +930,8 @@ steps: ...@@ -930,6 +930,8 @@ steps:
- csrc/quantization/fp4/ - csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/ - vllm/compilation/
# can affect pattern matching # can affect pattern matching
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
......
...@@ -47,12 +47,8 @@ if current_platform.is_cuda(): ...@@ -47,12 +47,8 @@ if current_platform.is_cuda():
ModelBackendTestCase( ModelBackendTestCase(
# Use smaller model for L40s in CI # Use smaller model for L40s in CI
model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
# TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
# so FI attention+fp8_quant is at least tested once
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
backend=AttentionBackendEnum.FLASHINFER backend=AttentionBackendEnum.TRITON_ATTN,
if is_blackwell()
else AttentionBackendEnum.TRITON_ATTN,
matches=Matches( matches=Matches(
attention_fusion=32, attention_fusion=32,
allreduce_fusion=65, allreduce_fusion=65,
...@@ -65,9 +61,9 @@ if current_platform.is_cuda(): ...@@ -65,9 +61,9 @@ if current_platform.is_cuda():
model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
# TODO FlashInfer attn broken on Hopper with kvcache=fp8: # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
# https://github.com/vllm-project/vllm/issues/28568 # https://github.com/vllm-project/vllm/issues/28568
# TODO FlashInfer attn broken on Blackwell for llama4: backend=AttentionBackendEnum.FLASHINFER
# https://github.com/vllm-project/vllm/issues/28604 if is_blackwell()
backend=AttentionBackendEnum.TRITON_ATTN, else AttentionBackendEnum.TRITON_ATTN,
matches=Matches( matches=Matches(
attention_fusion=48, attention_fusion=48,
allreduce_fusion=96, allreduce_fusion=96,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment