[fix]解决v1 deepseek cudagraph模式显存占用增长

e85207b7 · zhuwenwen · b909d6fc · e85207b7 · e85207b7 · e85207b7
Commit e85207b7 authored Aug 18, 2025 by zhuwenwen
Showing with 22 additions and 3 deletions

vllm/compilation/decorators.py vllm/compilation/decorators.py +2 -1

vllm/forward_context.py vllm/forward_context.py +13 -0

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +7 -2

No files found.
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -11,6 +11,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator

 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
+from vllm.forward_context import get_profilling
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
@@ -202,7 +203,7 @@ def _support_torch_compile(
        # torch.compiler.is_compiling() means we are inside the compilation
        # e.g. TPU has the compilation logic in model runner, so we don't
        # need to compile the model inside.
-        if self.do_not_compile or torch.compiler.is_compiling():
+        if self.do_not_compile or torch.compiler.is_compiling() or get_profilling():
            return self.forward(*args, **kwargs)

        # the first compilation needs to have dynamic shapes marked

--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -196,3 +196,16 @@ def set_forward_context(
        _forward_context = prev_context
        if envs.VLLM_ENABLE_TBO:
            set_tbo_forward_context(_forward_context)
+
+
+_profiling: bool = False  # module-level flag, written only by set_profilling()
+
+def set_profilling(profiling: bool) -> None:
+    """Store the profiling flag; a plain setter, not a context manager."""
+    global _profiling
+    _profiling = profiling
+
+
+def get_profilling() -> bool:
+    """Return the flag most recently stored by set_profilling()."""
+    return _profiling
\ No newline at end of file
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -28,7 +28,7 @@ from vllm.distributed.parallel_state import (
    get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
    prepare_communication_buffer_for_model)
 from vllm.forward_context import (DPMetadata, get_forward_context,
-                                  set_forward_context)
+                                  set_forward_context, set_profilling)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
@@ -2225,7 +2225,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            else:
                hidden_states = outputs

-            if self.speculative_config and self.speculative_config.use_eagle():
+            if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
                assert isinstance(self.drafter, EagleProposer)
                self.drafter.dummy_run(num_tokens, attn_metadata)

@@ -2385,6 +2385,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        return self._dummy_pooler_run_task(hidden_states, max_task)

    def profile_run(self) -> None:
+        # set profiling flag to avoid torch compile
+        set_profilling(True)
+        self._sync_device()
+
        # Profile with multimodal encoder & encoder cache.
        # TODO: handle encoder-decoder models once we support them.
        if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
@@ -2470,6 +2474,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        del hidden_states, output
        self.encoder_cache.clear()
        gc.collect()
+        set_profilling(False)

    def capture_model(self) -> None:
        if not self.use_cuda_graph: