[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>

[V1] Multiprocessing Tensor Parallel Support for v1 (#9856)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
28b3a1c7 · Tyler Michael Smith · GitHub · bc192a2b · 28b3a1c7
Unverified commit 28b3a1c7, authored Dec 10, 2024 by Tyler Michael Smith; committed by GitHub on Dec 10, 2024.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 9 additions and 2 deletions

vllm/v1/worker/gpu_worker.py vllm/v1/worker/gpu_worker.py +9 -2

No files found.
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -15,6 +15,7 @@ from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -56,7 +57,6 @@ class Worker:
            from vllm.utils import init_cached_hf_modules
            init_cached_hf_modules()
-        self.model_runner = GPUModelRunner(vllm_config)
        # Torch profiler. Enabled and configured through env vars:
        # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
        if envs.VLLM_TORCH_PROFILER_DIR:
@@ -103,6 +103,9 @@ class Worker:
        # Set random seed.
        set_random_seed(self.model_config.seed)
+        # Construct the model runner
+        self.model_runner = GPUModelRunner(self.vllm_config, self.device)
    def load_model(self) -> None:
        self.model_runner.load_model()
@@ -198,7 +201,7 @@ class Worker:
        scheduler_output: "SchedulerOutput",
    ) -> ModelRunnerOutput:
        output = self.model_runner.execute_model(scheduler_output)
-        # TODO(woosuk): Send the output to the engine process.
+        return output if self.rank == 0 else None
        return output
    def profile(self, is_start=True):
@@ -209,6 +212,10 @@ class Worker:
        else:
            self.profiler.stop()
+    def check_health(self) -> None:
+        # worker will always be healthy as long as it's running.
+        return
 def init_worker_distributed_environment(
    parallel_config: ParallelConfig,