[Bugfix] Fix chunked prefill for GGUF (#14666)

Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>

[Bugfix] Fix chunked prefill for GGUF (#14666)
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
55211b01 · Szymon Ożóg · GitHub · 5d043c16 · 55211b01
Unverified Commit 55211b01 authored Mar 13, 2025 by Szymon Ożóg Committed by GitHub Mar 13, 2025
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 0 deletions

vllm/model_executor/layers/quantization/gguf.py vllm/model_executor/layers/quantization/gguf.py +7 -0

No files found.
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -98,6 +98,13 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
 def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
                  qweight_type: int) -> torch.Tensor:
+    # HACK: a chunked-prefill step generates no output tokens, so the input
+    # to the logits generator is empty, which causes an invalid-parameter error
+    if x.shape[0] == 0:
+        return torch.empty(x.shape[0],
+                           qweight.shape[0],
+                           dtype=x.dtype,
+                           device=x.device)
    # there is no need to call any kernel for fp16/bf16
    if qweight_type in UNQUANTIZED_TYPES:
        return x @ qweight.T