[v1] reduce graph capture time for piecewise cudagraph (#10059)

Signed-off-by: youkaichao <youkaichao@gmail.com>

[v1] reduce graph capture time for piecewise cudagraph (#10059)
Signed-off-by: youkaichao <youkaichao@gmail.com>
c4cacbaa · youkaichao · GitHub · 0c63c34f · c4cacbaa
Unverified Commit c4cacbaa authored Nov 05, 2024 by youkaichao Committed by GitHub Nov 05, 2024
Show whitespace changes
Inline Side-by-side

Showing with 25 additions and 11 deletions

vllm/compilation/backends.py vllm/compilation/backends.py +25 -11

No files found.
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
 import copy
 import dataclasses
 import operator
+from contextlib import ExitStack
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+from unittest.mock import patch

 import torch
 import torch.fx as fx
@@ -503,6 +505,18 @@ class PiecewiseBackend:
            entry.input_addresses = input_addresses
            cudagraph = torch.cuda.CUDAGraph()

+            with ExitStack() as stack:
+                if not self.is_first_graph:
+                    # During every model forward, we capture many
+                    # pieces of cudagraphs (roughly one per layer).
+                    # Running gc again and again across layers makes
+                    # the cudagraph capture very slow. Therefore, gc
+                    # runs only for the first graph; for the rest,
+                    # gc.collect and empty_cache are patched to no-ops.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(
+                        patch("torch.cuda.empty_cache", lambda: None))
+
                # mind-exploding: carefully manage the reference and memory.
                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
                    # `output` is managed by pytorch's cudagraph pool