[Kernel][Misc] register ops to prevent graph breaks (#6917)

Co-authored-by: Sage Moore <sage@neuralmagic.com>

[Kernel][Misc] register ops to prevent graph breaks (#6917)
Co-authored-by: Sage Moore <sage@neuralmagic.com>
73202dbe · bnellnm · GitHub · 7015417f · 73202dbe · 73202dbe
Unverified commit 73202dbe, authored Sep 11, 2024 by bnellnm; committed by GitHub on Sep 11, 2024
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 9 additions and 3 deletions

vllm/worker/model_runner.py vllm/worker/model_runner.py +8 -3

vllm/worker/worker.py vllm/worker/worker.py +1 -0

No files found.
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -75,6 +75,10 @@ _NUM_WARMUP_ITERS = 2

 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")

+# For now, bump up cache limits for recompilations during CUDA graph warmups.
+torch._dynamo.config.cache_size_limit = 128
+torch._dynamo.config.accumulated_cache_size_limit = 128
+

 @dataclass(frozen=True)
 class ModelInputForGPU(ModelRunnerInputBase):
@@ -1060,9 +1064,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
                    "This may lead to less accurate results!")

        if envs.VLLM_TEST_DYNAMO_GRAPH_CAPTURE and supports_dynamo():
-            self.model = torch.compile(self.model,
-                                       fullgraph=True,
-                                       backend="eager")
+            self.model = torch.compile(
+                self.model,
+                fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE,
+                backend="eager")

    def save_sharded_state(
        self,

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -166,6 +166,7 @@ class Worker(LocalOrDistributedWorkerBase):
            torch.cuda.set_device(self.device)

            _check_if_gpu_supports_dtype(self.model_config.dtype)
+            gc.collect()
            torch.cuda.empty_cache()
            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
        else: