[bug fix] Fix llama4 spec decoding (#22691)

Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>

[bug fix] Fix llama4 spec decoding (#22691)
Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
5bfe0dea · qizixi · GitHub · 31fd3265 · 5bfe0dea
Unverified Commit 5bfe0dea authored Aug 19, 2025 by qizixi Committed by GitHub Aug 19, 2025
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

vllm/model_executor/models/llama4.py vllm/model_executor/models/llama4.py +4 -2

No files found.
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -195,7 +195,9 @@ class Llama4Attention(nn.Module):
            is_neox_style=is_neox_style,
        ) if not self.nope else None

-        attn_cls = Attention if self.nope else ChunkedLocalAttention
+        use_chunked_local_attn = not self.nope and config.attention_chunk_size
+        attn_cls = (ChunkedLocalAttention
+                    if use_chunked_local_attn else Attention)
        self.attn = attn_cls(
            self.num_heads,
            self.head_dim,
@@ -206,7 +208,7 @@ class Llama4Attention(nn.Module):
            prefix=f"{prefix}.attn",
            **({
                "attention_chunk_size": config.attention_chunk_size
-            } if not self.nope else {}))
+            } if use_chunked_local_attn else {}))

    def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
        floor = torch.floor((positions + 1.0) / self.floor_scale)