[XPU] Enable ModelRunnerV2 on XPU (#36078)

Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>

[XPU] Enable ModelRunnerV2 on XPU (#36078)
Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
d8839ef7 · Xinyu Chen · GitHub · e998fa76 · d8839ef7 · d8839ef7
Unverified commit d8839ef7, authored Mar 06, 2026 by Xinyu Chen; committed by GitHub Mar 05, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 21 additions and 2 deletions

vllm/v1/worker/xpu_model_runner.py +18 -0

vllm/v1/worker/xpu_worker.py +3 -2

No files found.
--- a/vllm/v1/worker/xpu_model_runner.py
+++ b/vllm/v1/worker/xpu_model_runner.py
@@ -8,6 +8,9 @@ import torch
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.torch_utils import supports_xpu_graph
+from vllm.v1.worker.gpu.model_runner import (
+    GPUModelRunner as GPUModelRunnerV2,
+)
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 if TYPE_CHECKING:
@@ -30,6 +33,18 @@ class XPUModelRunner(GPUModelRunner):
        self.cascade_attn_enabled = False
+class XPUModelRunnerV2(GPUModelRunnerV2):
+    """A model runner for XPU devices."""
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        device: torch.device,
+    ):
+        with _torch_cuda_wrapper():
+            super().__init__(vllm_config, device)
 @contextmanager
 def _torch_cuda_wrapper():
    try:
@@ -39,9 +54,12 @@ def _torch_cuda_wrapper():
        torch.cuda.current_stream = torch.xpu.current_stream
        torch.cuda.stream = torch.xpu.stream
        torch.cuda.mem_get_info = torch.xpu.mem_get_info
+        torch.cuda.Event = torch.Event
+        torch.cuda.set_stream = torch.xpu.set_stream
        if supports_xpu_graph():
            torch.cuda.graph = torch.xpu.graph
            torch.cuda.CUDAGraph = torch.xpu.XPUGraph
+            torch.cuda.graph_pool_handle = torch.xpu.graph_pool_handle
        yield
    finally:
        pass
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -15,7 +15,7 @@ from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.utils import report_usage_stats
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 from vllm.v1.worker.workspace import init_workspace_manager
-from vllm.v1.worker.xpu_model_runner import XPUModelRunner
+from vllm.v1.worker.xpu_model_runner import XPUModelRunner, XPUModelRunnerV2
 from .utils import request_memory
@@ -105,7 +105,8 @@ class XPUWorker(Worker):
        init_workspace_manager(self.device, num_ubatches)
        # Construct the model runner
-        self.model_runner = XPUModelRunner(  # type: ignore
+        model_runner = XPUModelRunnerV2 if self.use_v2_model_runner else XPUModelRunner
+        self.model_runner = model_runner(  # type: ignore
            self.vllm_config, self.device
        )