[TPU] support disabling xla compilation cache (#15567)

Signed-off-by: Chengji Yao <chengjiyao@google.com>

[TPU] support disabling xla compilation cache (#15567)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
e74ff409 · Chengji Yao · GitHub · 7a888271 · e74ff409 · e74ff409
Unverified Commit e74ff409 authored Mar 26, 2025 by Chengji Yao Committed by GitHub Mar 27, 2025
Show whitespace changes
Inline Side-by-side

Showing with 20 additions and 6 deletions

vllm/v1/worker/tpu_worker.py vllm/v1/worker/tpu_worker.py +10 -3

vllm/worker/tpu_worker.py vllm/worker/tpu_worker.py +10 -3

No files found.
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -113,6 +113,13 @@ class TPUWorker:
        # can have slightly different XLA graphs.
        world_size = self.parallel_config.world_size
        rank = xr.global_ordinal()
+        # The PyTorch/XLA compilation cache uses the Torch IR to generate keys.
+        # Consequently, changes in optimization flags, which affect compilation
+        # results, don't change the cache key. This can result in the wrong
+        # compilation being used. To prevent this, disabling the XLA compilation
+        # cache during development is recommended.We can disable it by
+        # `export VLLM_XLA_CACHE_PATH=`
+        if envs.VLLM_XLA_CACHE_PATH:
            per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH,
                                         f"tp{world_size}_rank{rank}")
            xr.initialize_cache(per_rank_path, readonly=False)

--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -93,6 +93,13 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        # can have slightly different XLA graphs.
        world_size = self.parallel_config.world_size
        rank = xr.global_ordinal()
+        # The PyTorch/XLA compilation cache uses the Torch IR to generate keys.
+        # Consequently, changes in optimization flags, which affect compilation
+        # results, don't change the cache key. This can result in the wrong
+        # compilation being used. To prevent this, disabling the XLA compilation
+        # cache during development is recommended.We can disable it by
+        # `export VLLM_XLA_CACHE_PATH=`
+        if envs.VLLM_XLA_CACHE_PATH:
            per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH,
                                         f"tp{world_size}_rank{rank}")
            xr.initialize_cache(per_rank_path, readonly=False)