fix: revert cast to cpu in `MsgpackEncoder._encode_tensor` to avoid hidden performance regressions (#25738)

fix: revert cast to cpu in `MsgpackEncoder._encode_tensor` to avoid hidden performance regressions (#25738)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>

fix: revert cast to cpu in `MsgpackEncoder._encode_tensor` to avoid hidden...
fix: revert cast to cpu in `MsgpackEncoder._encode_tensor` to avoid hidden performance regressions (#25738)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
e84e0735 · Andrew Sansom · GitHub · 3edf87d2 · e84e0735 · e84e0735
Unverified Commit e84e0735 authored Sep 26, 2025 by Andrew Sansom Committed by GitHub Sep 26, 2025
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 6 additions and 1 deletion

vllm/inputs/preprocess.py vllm/inputs/preprocess.py +5 -0

vllm/v1/serial_utils.py vllm/v1/serial_utils.py +1 -1

No files found.
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -278,6 +278,11 @@ class InputPreprocessor:
            raise ValueError(
                "prompt_embeds must be of shape (seq_len, hidden_size).")
+        # Tensors must be on CPU for serialization between processes
+        # in the MsgpackEncoder. Casting to CPU here ensures that there is no
+        # hidden device transfer in the critical path of generation.
+        prompt_embeds = prompt_embeds.cpu()
        return embeds_inputs(prompt_embeds=prompt_embeds,
                             cache_salt=parsed_content.get("cache_salt"))

--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -208,7 +208,7 @@ class MsgpackEncoder:
    ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
        assert self.aux_buffers is not None
        # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
+        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
        if obj.nbytes < self.size_threshold:
            # Smaller tensors are encoded inline, just like ndarrays.
            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)