Unverified Commit d69ff0cb authored by Tao He, committed by GitHub

Fixes assertion failure in prefix caching: the lora index mapping should respect prefix_len (#2688)


Signed-off-by: Tao He <sighingnow@gmail.com>
parent 1af090b5
@@ -142,10 +142,10 @@ class ModelRunner:
             if lora_id > 0:
                 lora_requests.add(seq_group_metadata.lora_request)
-            lora_index_mapping.append([lora_id] * prompt_len)
+            lora_index_mapping.append([lora_id] * (prompt_len - prefix_len))
             lora_prompt_mapping.extend(
                 [lora_id] *
-                (prompt_len
+                (prompt_len - prefix_len
                  if seq_group_metadata.sampling_params.prompt_logprobs else 1))
             if seq_group_metadata.block_tables is None:
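For context, a minimal sketch (not vLLM code; the helper name `build_lora_index_mapping` and the standalone assertion are illustrative) of the invariant the fix restores: with prefix caching, only the non-cached suffix of the prompt is run through the model in this step, so the per-token LoRA index mapping must contain `prompt_len - prefix_len` entries rather than `prompt_len`.

```python
# Sketch of the length invariant behind the fix, assuming prefix caching
# means the first `prefix_len` prompt tokens are served from cache and only
# the remaining tokens are processed, each needing a LoRA index entry.

def build_lora_index_mapping(lora_id: int, prompt_len: int,
                             prefix_len: int) -> list[int]:
    # Tokens actually processed this step: the prompt minus the cached prefix.
    num_processed_tokens = prompt_len - prefix_len
    return [lora_id] * num_processed_tokens

# Example: an 8-token prompt whose first 5 tokens hit the prefix cache.
mapping = build_lora_index_mapping(lora_id=3, prompt_len=8, prefix_len=5)
assert len(mapping) == 8 - 5  # must match the 3 tokens sent to the model
```

Using `prompt_len` here (the pre-fix behavior) would make the mapping longer than the number of tokens processed, which is what triggered the assertion failure reported in #2688.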