Commit e85207b7 authored by zhuwenwen
Browse files

[fix]解决v1 deepseek cudagraph模式显存占用增长

parent b909d6fc
......@@ -11,6 +11,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator
from vllm.compilation.counter import compilation_counter
from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
from vllm.forward_context import get_profilling
from vllm.config import CompilationLevel, VllmConfig
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
......@@ -202,7 +203,7 @@ def _support_torch_compile(
# torch.compiler.is_compiling() means we are inside the compilation
# e.g. TPU has the compilation logic in model runner, so we don't
# need to compile the model inside.
if self.do_not_compile or torch.compiler.is_compiling():
if self.do_not_compile or torch.compiler.is_compiling() or get_profilling():
return self.forward(*args, **kwargs)
# the first compilation needs to have dynamic shapes marked
......
......@@ -196,3 +196,16 @@ def set_forward_context(
_forward_context = prev_context
if envs.VLLM_ENABLE_TBO:
set_tbo_forward_context(_forward_context)
# Module-level flag toggled through set_profilling() and read through
# get_profilling(). Starts out False.
_profiling: bool = False


def set_profilling(profiling: bool) -> None:
    """Set the module-level profiling flag.

    This is a plain setter, meant to be called as a statement, e.g.
    ``set_profilling(True)`` before a profiling run and
    ``set_profilling(False)`` after it. It is deliberately NOT decorated
    with ``@contextmanager``: the body contains no ``yield``, so the
    decorator would hand back an unusable context-manager object and any
    ``with set_profilling(...)`` would fail at ``__enter__``.

    The spelling ``profilling`` is kept as-is because existing callers
    import the function under this name.

    Args:
        profiling: New value for the flag.
    """
    global _profiling
    _profiling = profiling


def get_profilling() -> bool:
    """Return the current value of the module-level profiling flag."""
    # Read-only access to a module global needs no ``global`` statement.
    return _profiling
\ No newline at end of file
......@@ -28,7 +28,7 @@ from vllm.distributed.parallel_state import (
get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
prepare_communication_buffer_for_model)
from vllm.forward_context import (DPMetadata, get_forward_context,
set_forward_context)
set_forward_context, set_profilling)
from vllm.logger import init_logger
from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaBase
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
......@@ -2225,7 +2225,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
else:
hidden_states = outputs
if self.speculative_config and self.speculative_config.use_eagle():
if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
assert isinstance(self.drafter, EagleProposer)
self.drafter.dummy_run(num_tokens, attn_metadata)
......@@ -2385,6 +2385,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
return self._dummy_pooler_run_task(hidden_states, max_task)
def profile_run(self) -> None:
# set profiling flag to avoid torch compile
set_profilling(True)
self._sync_device()
# Profile with multimodal encoder & encoder cache.
# TODO: handle encoder-decoder models once we support them.
if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0
......@@ -2470,6 +2474,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
del hidden_states, output
self.encoder_cache.clear()
gc.collect()
set_profilling(False)
def capture_model(self) -> None:
if not self.use_cuda_graph:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment