Merge tag 'v0.8.3' into v0.8.3-dev

fcfc474d · zhuwenwen · bb94d2e5 · 296c6572 · fcfc474d · fcfc474d
Commit fcfc474d authored Apr 09, 2025 by zhuwenwen
Showing with 15 additions and 8 deletions

vllm/worker/tpu_worker.py vllm/worker/tpu_worker.py +10 -3

vllm/worker/worker.py vllm/worker/worker.py +2 -2

vllm/worker/xpu_model_runner.py vllm/worker/xpu_model_runner.py +3 -3

No files found.
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -93,9 +93,16 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        # can have slightly different XLA graphs.
        world_size = self.parallel_config.world_size
        rank = xr.global_ordinal()
-        per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH,
+        # The PyTorch/XLA compilation cache uses the Torch IR to generate keys.
-                                     f"tp{world_size}_rank{rank}")
+        # Consequently, changes in optimization flags, which affect compilation
-        xr.initialize_cache(per_rank_path, readonly=False)
+        # results, don't change the cache key. This can result in the wrong
+        # compilation being used. To prevent this, disabling the XLA compilation
+        # cache during development is recommended. We can disable it by
+        # `export VLLM_XLA_CACHE_PATH=`
+        if envs.VLLM_XLA_CACHE_PATH:
+            per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH,
+                                         f"tp{world_size}_rank{rank}")
+            xr.initialize_cache(per_rank_path, readonly=False)
        self.profiler = None
        if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1:

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -135,9 +135,9 @@ class Worker(LocalOrDistributedWorkerBase):
            "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
            used_bytes / GiB_bytes)
-    def wake_up(self) -> None:
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        allocator = CuMemAllocator.get_instance()
-        allocator.wake_up()
+        allocator.wake_up(tags=tags)
    def init_device(self) -> None:
        if self.device_config.device.type == "cuda":

--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -25,7 +25,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                             MultiModalRegistry)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
-from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad
+from vllm.utils import DeviceMemoryProfiler, GiB_bytes, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata
 from vllm.worker.model_runner_base import (
    ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase,
@@ -422,8 +422,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
            self.model = get_model(vllm_config=self.vllm_config)
        self.model_memory_usage = m.consumed_memory
-        logger.info("Loading model weights took %.4f GB",
+        logger.info("Loading model weights took %.4f GiB",
-                    self.model_memory_usage / float(2**30))
+                    self.model_memory_usage / GiB_bytes)
    def get_model(self) -> nn.Module:
        return self.model