[TPU] Set per-rank XLA cache (#7533)

951fdd66 · Woosuk Kwon · GitHub · 2ecf7b17 · 951fdd66
Commit 951fdd66 (unverified), authored Aug 14, 2024 by Woosuk Kwon; committed by GitHub on Aug 14, 2024.
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 6 deletions

vllm/worker/tpu_worker.py vllm/worker/tpu_worker.py +6 -6

No files found.
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -102,12 +102,12 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        # 30-40 graphs for decode. 128 is an arbitrary safe number.
        torch._dynamo.config.cache_size_limit = 128
        # Use persistent cache to avoid XLA recompilation.
-        # NOTE(woosuk): This does not completely eliminate the recompilation
-        # overhead because dynamo does not cache the compiled results.
-        # NOTE(woosuk): Set readonly=False only for the rank 0 process to avoid
-        # race conditions.
-        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH,
-                            readonly=not self.is_driver_worker)
+        # NOTE(woosuk): Set per-rank cache path since different ranks
+        # can have slightly different XLA graphs.
+        world_size = self.parallel_config.world_size
+        per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH,
+                                     f"tp{world_size}_rank{self.rank}")
+        xr.initialize_cache(per_rank_path, readonly=False)

    def load_model(self):
        self.model_runner.load_model()