Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6437ff1f
Unverified
Commit
6437ff1f
authored
Jan 21, 2026
by
Wentao Ye
Committed by
GitHub
Jan 22, 2026
Browse files
[Deprecation] Remove deprecated environment variables (#32812)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
5e00b561
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
7 additions
and
118 deletions
+7
-118
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+2
-2
tests/v1/spec_decode/test_acceptance_length.py
tests/v1/spec_decode/test_acceptance_length.py
+1
-1
vllm/config/attention.py
vllm/config/attention.py
+0
-46
vllm/envs.py
vllm/envs.py
+0
-67
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+4
-1
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+0
-1
No files found.
.buildkite/test-amd.yaml
View file @
6437ff1f
...
...
@@ -1473,7 +1473,7 @@ steps:
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-
VLLM_ATTENTION_BACKEND=ROCM_ATTN
bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
--attention-backend ROCM_ATTN
-
label
:
DP EP NixlConnector PD accuracy tests (Distributed)
# 15min
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
...
...
@@ -1487,7 +1487,7 @@ steps:
-
tests/v1/kv_connector/nixl_integration/
commands
:
-
uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
-
VLLM_ATTENTION_BACKEND=ROCM_ATTN
DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
-
DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
--attention-backend ROCM_ATTN
##### multi gpus test #####
##### A100 test #####
...
...
tests/v1/spec_decode/test_acceptance_length.py
View file @
6437ff1f
...
...
@@ -207,7 +207,6 @@ def test_eagle3_acceptance_length(
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attention_backend
)
with
VllmRunner
(
model_name
=
model_config
.
verifier
,
...
...
@@ -216,6 +215,7 @@ def test_eagle3_acceptance_length(
"model"
:
model_config
.
drafter
,
"num_speculative_tokens"
:
num_spec_tokens
,
},
attention_config
=
{
"backend"
:
attention_backend
},
tensor_parallel_size
=
tp_size
,
gpu_memory_utilization
=
0.7
,
disable_log_stats
=
False
,
...
...
vllm/config/attention.py
View file @
6437ff1f
...
...
@@ -7,11 +7,8 @@ from pydantic import field_validator
from
pydantic.dataclasses
import
dataclass
from
vllm.config.utils
import
config
from
vllm.logger
import
init_logger
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
logger
=
init_logger
(
__name__
)
@
config
@
dataclass
...
...
@@ -69,46 +66,3 @@ class AttentionConfig:
if
isinstance
(
value
,
str
):
return
AttentionBackendEnum
[
value
.
upper
()]
return
value
def
_set_from_env_if_set
(
self
,
field_name
:
str
,
env_var_name
:
str
)
->
None
:
"""Set field from env var if set, with deprecation warning."""
from
vllm
import
envs
if
envs
.
is_set
(
env_var_name
):
value
=
getattr
(
envs
,
env_var_name
)
if
field_name
==
"backend"
:
value
=
self
.
validate_backend_before
(
value
)
setattr
(
self
,
field_name
,
value
)
logger
.
warning_once
(
"Using %s environment variable is deprecated and will be removed in "
"v0.14.0 or v1.0.0, whichever is soonest. Please use "
"--attention-config.%s command line argument or "
"AttentionConfig(%s=...) config field instead."
,
env_var_name
,
field_name
,
field_name
,
)
def
__post_init__
(
self
)
->
None
:
self
.
_set_from_env_if_set
(
"backend"
,
"VLLM_ATTENTION_BACKEND"
)
self
.
_set_from_env_if_set
(
"flash_attn_version"
,
"VLLM_FLASH_ATTN_VERSION"
)
self
.
_set_from_env_if_set
(
"use_prefill_decode_attention"
,
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
)
self
.
_set_from_env_if_set
(
"flash_attn_max_num_splits_for_cuda_graph"
,
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"
,
)
self
.
_set_from_env_if_set
(
"use_cudnn_prefill"
,
"VLLM_USE_CUDNN_PREFILL"
)
self
.
_set_from_env_if_set
(
"use_trtllm_ragged_deepseek_prefill"
,
"VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL"
,
)
self
.
_set_from_env_if_set
(
"use_trtllm_attention"
,
"VLLM_USE_TRTLLM_ATTENTION"
)
self
.
_set_from_env_if_set
(
"disable_flashinfer_prefill"
,
"VLLM_DISABLE_FLASHINFER_PREFILL"
)
self
.
_set_from_env_if_set
(
"disable_flashinfer_q_quantization"
,
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION"
,
)
vllm/envs.py
View file @
6437ff1f
...
...
@@ -20,8 +20,6 @@ if TYPE_CHECKING:
VLLM_NCCL_SO_PATH
:
str
|
None
=
None
LD_LIBRARY_PATH
:
str
|
None
=
None
VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE
:
int
=
256
VLLM_V1_USE_PREFILL_DECODE_ATTENTION
:
bool
=
False
VLLM_FLASH_ATTN_VERSION
:
int
|
None
=
None
LOCAL_RANK
:
int
=
0
CUDA_VISIBLE_DEVICES
:
str
|
None
=
None
VLLM_ENGINE_ITERATION_TIMEOUT_S
:
int
=
60
...
...
@@ -36,7 +34,6 @@ if TYPE_CHECKING:
VLLM_CONFIG_ROOT
:
str
=
os
.
path
.
expanduser
(
"~/.config/vllm"
)
VLLM_USAGE_STATS_SERVER
:
str
=
"https://stats.vllm.ai"
VLLM_NO_USAGE_STATS
:
bool
=
False
VLLM_DISABLE_FLASHINFER_PREFILL
:
bool
=
False
VLLM_DO_NOT_TRACK
:
bool
=
False
VLLM_USAGE_SOURCE
:
str
=
""
VLLM_CONFIGURE_LOGGING
:
bool
=
True
...
...
@@ -48,7 +45,6 @@ if TYPE_CHECKING:
NO_COLOR
:
bool
=
False
VLLM_LOG_STATS_INTERVAL
:
float
=
10.0
VLLM_TRACE_FUNCTION
:
int
=
0
VLLM_ATTENTION_BACKEND
:
str
|
None
=
None
VLLM_USE_FLASHINFER_SAMPLER
:
bool
|
None
=
None
VLLM_PP_LAYER_PARTITION
:
str
|
None
=
None
VLLM_CPU_KVCACHE_SPACE
:
int
|
None
=
0
...
...
@@ -142,7 +138,6 @@ if TYPE_CHECKING:
VLLM_SERVER_DEV_MODE
:
bool
=
False
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
:
int
=
128
VLLM_MLA_DISABLE
:
bool
=
False
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH
:
int
=
32
VLLM_RAY_PER_WORKER_GPUS
:
float
=
1.0
VLLM_RAY_BUNDLE_INDICES
:
str
=
""
VLLM_CUDART_SO_PATH
:
str
|
None
=
None
...
...
@@ -214,15 +209,11 @@ if TYPE_CHECKING:
VLLM_MORIIO_POST_BATCH_SIZE
:
int
=
-
1
VLLM_MORIIO_NUM_WORKERS
:
int
=
1
VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT
:
int
=
480
VLLM_USE_CUDNN_PREFILL
:
bool
=
False
VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL
:
bool
=
False
VLLM_ENABLE_CUDAGRAPH_GC
:
bool
=
False
VLLM_LOOPBACK_IP
:
str
=
""
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
bool
=
True
VLLM_ENABLE_RESPONSES_API_STORE
:
bool
=
False
VLLM_USE_TRTLLM_ATTENTION
:
str
|
None
=
None
VLLM_NVFP4_GEMM_BACKEND
:
str
|
None
=
None
VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION
:
bool
=
False
VLLM_HAS_FLASHINFER_CUBIN
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
:
bool
=
False
...
...
@@ -592,17 +583,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE"
:
lambda
:
int
(
os
.
environ
.
get
(
"VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE"
,
"256"
)
),
# Use separate prefill and decode kernels for V1 attention instead of
# the unified triton kernel.
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
:
lambda
:
(
os
.
getenv
(
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)
),
# Force vllm to use a specific flash-attention version (2 or 3), only valid
# when using the flash-attention backend.
"VLLM_FLASH_ATTN_VERSION"
:
lambda
:
maybe_convert_int
(
os
.
environ
.
get
(
"VLLM_FLASH_ATTN_VERSION"
,
None
)
),
# Feature flag to enable/disable Inductor standalone compile.
# In torch <= 2.7 we ignore this flag; in torch >= 2.9 this is
# enabled by default.
...
...
@@ -668,10 +648,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USAGE_STATS_SERVER"
,
"https://stats.vllm.ai"
),
"VLLM_NO_USAGE_STATS"
:
lambda
:
os
.
environ
.
get
(
"VLLM_NO_USAGE_STATS"
,
"0"
)
==
"1"
,
"VLLM_DISABLE_FLASHINFER_PREFILL"
:
lambda
:
os
.
environ
.
get
(
"VLLM_DISABLE_FLASHINFER_PREFILL"
,
"0"
)
==
"1"
,
"VLLM_DO_NOT_TRACK"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_DO_NOT_TRACK"
,
None
)
or
os
.
environ
.
get
(
"DO_NOT_TRACK"
,
None
)
...
...
@@ -707,25 +683,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set to 1, vllm will trace function calls
# Useful for debugging
"VLLM_TRACE_FUNCTION"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_TRACE_FUNCTION"
,
"0"
)),
# Backend for attention computation
# Example options:
# - "TORCH_SDPA": use torch.nn.MultiheadAttention
# - "FLASH_ATTN": use FlashAttention
# - "FLASHINFER": use flashinfer
# - "FLASHMLA": use FlashMLA
# - "FLASH_ATTN_MLA": use FlashAttention for MLA
# - "FLASHINFER_MLA": use FlashInfer for MLA
# - "CUTLASS_MLA": use CUTLASS for MLA
# All possible options loaded dynamically from AttentionBackendEnum
"VLLM_ATTENTION_BACKEND"
:
env_with_choices
(
"VLLM_ATTENTION_BACKEND"
,
None
,
lambda
:
list
(
__import__
(
"vllm.v1.attention.backends.registry"
,
fromlist
=
[
"AttentionBackendEnum"
]
).
AttentionBackendEnum
.
__members__
.
keys
()
),
),
# If set, vllm will use flashinfer sampler
"VLLM_USE_FLASHINFER_SAMPLER"
:
lambda
:
bool
(
int
(
os
.
environ
[
"VLLM_USE_FLASHINFER_SAMPLER"
])
...
...
@@ -1127,10 +1084,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the MLA attention optimizations.
"VLLM_MLA_DISABLE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_MLA_DISABLE"
,
"0"
))),
# If set, vLLM will pick up the provided Flash Attention MLA
# max number splits for cuda graph decode
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"
,
"32"
)
),
# Number of GPUs per worker in Ray, if it is set to be a fraction,
# it allows ray to schedule multiple actors on a single GPU,
# so that users can colocate other actors on the same GPUs as vLLM.
...
...
@@ -1464,26 +1417,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT"
,
"480"
)
),
# Controls whether or not to use cudnn prefill
"VLLM_USE_CUDNN_PREFILL"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_CUDNN_PREFILL"
,
"0"
))
),
# Controls whether to use TRT-LLM ragged DeepSeek prefill
"VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL"
,
"0"
))
),
# If set to 1/True, use the TRTLLM attention backend in flashinfer.
# If set to 0/False, use the default attention backend in flashinfer.
# If not set, auto-detect the attention backend in flashinfer.
"VLLM_USE_TRTLLM_ATTENTION"
:
lambda
:
(
None
if
"VLLM_USE_TRTLLM_ATTENTION"
not
in
os
.
environ
else
os
.
environ
[
"VLLM_USE_TRTLLM_ATTENTION"
].
lower
()
in
(
"1"
,
"true"
)
),
# If set to 1, when we use fp8 kv, we do not quantize Q to fp8
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION"
,
"0"
))
),
# If set, it means we pre-downloaded cubin files and flashinfer will
# read the cubin files directly.
"VLLM_HAS_FLASHINFER_CUBIN"
:
lambda
:
bool
(
...
...
vllm/platforms/rocm.py
View file @
6437ff1f
...
...
@@ -287,7 +287,10 @@ class RocmPlatform(Platform):
return
AttentionBackendEnum
.
ROCM_AITER_FA
.
get_path
()
# Priority 3: Check for ROCM_ATTN (prefill-decode split)
if
envs
.
VLLM_V1_USE_PREFILL_DECODE_ATTENTION
:
from
vllm.config
import
get_current_vllm_config
vllm_config
=
get_current_vllm_config
()
if
vllm_config
.
attention_config
.
use_prefill_decode_attention
:
logger
.
info
(
"Using Rocm Attention backend."
)
return
AttentionBackendEnum
.
ROCM_ATTN
.
get_path
()
...
...
vllm/usage/usage_lib.py
View file @
6437ff1f
...
...
@@ -37,7 +37,6 @@ _GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]()
_USAGE_ENV_VARS_TO_COLLECT
=
[
"VLLM_USE_MODELSCOPE"
,
"VLLM_ATTENTION_BACKEND"
,
"VLLM_USE_FLASHINFER_SAMPLER"
,
"VLLM_PP_LAYER_PARTITION"
,
"VLLM_USE_TRITON_AWQ"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment