Unverified commit 6d87a283, authored by Wentao Ye and committed by GitHub
Browse files

[Config] Remove Unused Environment Variable `VLLM_DISABLE_PAD_FOR_CUDAGRAPH` (#26743)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent e6cdbd67
...@@ -198,7 +198,6 @@ if TYPE_CHECKING: ...@@ -198,7 +198,6 @@ if TYPE_CHECKING:
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
VLLM_TUNED_CONFIG_FOLDER: str | None = None VLLM_TUNED_CONFIG_FOLDER: str | None = None
VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False VLLM_NVTX_SCOPES_FOR_PROFILING: bool = False
...@@ -1304,12 +1303,6 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1304,12 +1303,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool( "VLLM_ENABLE_CUDAGRAPH_GC": lambda: bool(
int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0")) int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))
), ),
# Disable padding to CUDA graph capture batch sizes.
# TODO(wentao): https://github.com/vllm-project/vllm/issues/23378
# After the issue is fixed, we can remove this flag.
"VLLM_DISABLE_PAD_FOR_CUDAGRAPH": lambda: bool(
int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))
),
# Used to force set up loopback IP # Used to force set up loopback IP
"VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""), "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
# Used to set the process name prefix for vLLM processes. # Used to set the process name prefix for vLLM processes.
......
...@@ -2067,7 +2067,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -2067,7 +2067,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int:
if ( if (
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH
and hasattr(self, "cudagraph_batch_sizes") and hasattr(self, "cudagraph_batch_sizes")
and self.cudagraph_batch_sizes and self.cudagraph_batch_sizes
and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1] and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment