[BugFix] lazy init _copy_stream to avoid torch init wrong gpu instance (#8403)

8a23e933 · WANGWEI · GitHub · c6202dae · 8a23e933
Unverified Commit 8a23e933 authored Sep 13, 2024 by WANGWEI Committed by GitHub Sep 12, 2024
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

vllm/worker/multi_step_model_runner.py vllm/worker/multi_step_model_runner.py +5 -2

No files found.
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -230,12 +230,15 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]):
        self._base_model_runner: GPUModelRunnerBase = base_model_runner

        self.is_multi_step = self.scheduler_config.is_multi_step
-        # used to copy tensors from GPU to CPU asynchronously
-        self._copy_stream = torch.cuda.Stream()
        self.pinned_sampled_token_ids: Optional[torch.Tensor] = None

        self.pythonization_cache = PythonizationCache()

+    @functools.cached_property
+    def _copy_stream(self):
+        # used to copy tensors from GPU to CPU asynchronously
+        return torch.cuda.Stream()
+
    def make_model_input_from_broadcasted_tensor_dict(
            self, tensor_dict: Dict[str, Any]) -> StatefulModelInput:
        model_input = (StatefulModelInput.from_broadcasted_tensor_dict(