[Speculative Decoding] Move indices to device before filtering output (#10850)

Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com>

[Speculative Decoding] Move indices to device before filtering output (#10850)
Co-authored-by: Yang Zheng(SW)(Alex) <you@example.com>
f6084f63 · Yang Zheng · GitHub · 9323a315 · f6084f63
Unverified commit f6084f63, authored Dec 03, 2024 by Yang Zheng; committed by GitHub on Dec 03, 2024
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 3 deletions

vllm/spec_decode/multi_step_worker.py vllm/spec_decode/multi_step_worker.py +6 -3

No files found.
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -120,6 +120,9 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
                    indices_of_seq_with_bonus_tokens)
                model_outputs.append(model_output)

+        # Move the indices to the device up front to avoid a stream sync.
+        indices_of_seq_with_bonus_tokens = torch.tensor(
+            indices_of_seq_with_bonus_tokens, device=self.device)
        filtered_model_outputs = self._filter_model_output(
            model_outputs, indices_of_seq_with_bonus_tokens)
        return filtered_model_outputs, True
@@ -189,7 +192,7 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
    @staticmethod
    def _filter_model_output(
            expanded_batch_outputs: List[SamplerOutput],
-            output_indices_to_retain: List[int]) -> List[SamplerOutput]:
+            output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]:
        """
        Filters the model output to include only the specified sequence
        outputs. This method contracts the expanded batch output from the
@@ -199,8 +202,8 @@ class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase):
        Args:
            expanded_batch_output (List[SamplerOutput]): The expanded output
                batch from the model.
-            output_indices_to_retain (List[int]): Indices of the model outputs
-                to retain.
+            output_indices_to_retain (torch.Tensor): Indices of the model
+                outputs to retain.

        Returns:
            List[SamplerOutput]: A list containing the filtered model