[Misc] Remove deprecated arg for cuda graph capture (#9864)

Signed-off-by: Roger Wang <ywang@roblox.com>

[Misc] Remove deprecated arg for cuda graph capture (#9864)
Signed-off-by: Roger Wang <ywang@roblox.com>
3ea2dc2e · Roger Wang · GitHub · d087bf86 · 3ea2dc2e · 3ea2dc2e
Unverified commit 3ea2dc2e, authored Oct 31, 2024 by Roger Wang; committed by GitHub on Oct 31, 2024.
Showing 4 changed files with 1 addition and 23 deletions

vllm/config.py +0 -7

vllm/engine/arg_utils.py +0 -10

vllm/entrypoints/llm.py +0 -5

vllm/worker/model_runner.py +1 -1

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -84,9 +84,6 @@ class ModelConfig:
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
            If None, the user did not specify, so default to False.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode. Additionally for encoder-decoder models, if the
@@ -147,7 +144,6 @@ class ModelConfig:
            quantization: Optional[str] = None,
            quantization_param_path: Optional[str] = None,
            enforce_eager: Optional[bool] = None,
-            max_context_len_to_capture: Optional[int] = None,
            max_seq_len_to_capture: Optional[int] = None,
            max_logprobs: int = 20,
            disable_sliding_window: bool = False,
@@ -181,9 +177,6 @@ class ModelConfig:
        self.quantization = quantization
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
-        if max_context_len_to_capture is not None:
-            raise ValueError("`max_context_len_to_capture` is deprecated. "
-                             "Use `max_seq_len_to_capture` instead.")
        self.max_seq_len_to_capture = max_seq_len_to_capture
        self.max_logprobs = max_logprobs
        self.disable_sliding_window = disable_sliding_window

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -126,7 +126,6 @@ class EngineArgs:
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: Optional[bool] = None
-    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    tokenizer_pool_size: int = 0
@@ -504,14 +503,6 @@ class EngineArgs:
                            help='Always use eager-mode PyTorch. If False, '
                            'will use eager mode and CUDA graph in hybrid '
                            'for maximal performance and flexibility.')
-        parser.add_argument('--max-context-len-to-capture',
-                            type=int,
-                            default=EngineArgs.max_context_len_to_capture,
-                            help='Maximum context length covered by CUDA '
-                            'graphs. When a sequence has context length '
-                            'larger than this, we fall back to eager mode. '
-                            '(DEPRECATED. Use --max-seq-len-to-capture instead'
-                            ')')
        parser.add_argument('--max-seq-len-to-capture',
                            type=int,
                            default=EngineArgs.max_seq_len_to_capture,
@@ -939,7 +930,6 @@ class EngineArgs:
            quantization=self.quantization,
            quantization_param_path=self.quantization_param_path,
            enforce_eager=self.enforce_eager,
-            max_context_len_to_capture=self.max_context_len_to_capture,
            max_seq_len_to_capture=self.max_seq_len_to_capture,
            max_logprobs=self.max_logprobs,
            disable_sliding_window=self.disable_sliding_window,

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -93,9 +93,6 @@ class LLM:
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
-        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
-            When a sequence has context length larger than this, we fall back
-            to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode. Additionally for encoder-decoder models, if the
@@ -152,7 +149,6 @@ class LLM:
        swap_space: float = 4,
        cpu_offload_gb: float = 0,
        enforce_eager: Optional[bool] = None,
-        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        disable_async_output_proc: bool = False,
@@ -193,7 +189,6 @@ class LLM:
            swap_space=swap_space,
            cpu_offload_gb=cpu_offload_gb,
            enforce_eager=enforce_eager,
-            max_context_len_to_capture=max_context_len_to_capture,
            max_seq_len_to_capture=max_seq_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            disable_async_output_proc=disable_async_output_proc,

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -995,7 +995,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
        # Python can be expensive. To optimize this, we cache the block table
        # in numpy and only copy the actual input content at every iteration.
        # The shape of the cached block table will be
-        # (max batch size to capture, max context len to capture / block size).
+        # (max batch size to capture, max seq len to capture / block size).
        self.graph_block_tables = np.zeros(
            (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
            dtype=np.int32)