Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e1710393
Unverified
Commit
e1710393
authored
Nov 12, 2025
by
wangxiyuan
Committed by
GitHub
Nov 11, 2025
Browse files
[[V0 deprecation]]Remove VLLM_USE_V1 env (#28204)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
3f770f44
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
15 additions
and
59 deletions
+15
-59
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
+1
-1
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+1
-2
examples/offline_inference/qwen2_5_omni/README.md
examples/offline_inference/qwen2_5_omni/README.md
+0
-2
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen2_5_omni/only_thinker.py
+1
-6
examples/others/lmcache/cpu_offload_lmcache.py
examples/others/lmcache/cpu_offload_lmcache.py
+12
-31
tests/entrypoints/openai/test_orca_metrics.py
tests/entrypoints/openai/test_orca_metrics.py
+0
-3
vllm/envs.py
vllm/envs.py
+0
-13
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+0
-1
No files found.
.buildkite/scripts/hardware_ci/run-cpu-test.sh
View file @
e1710393
...
...
@@ -76,7 +76,7 @@ function cpu_tests() {
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
#
VLLM_USE_V1=0
pytest -x -s -v \
# pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
...
...
examples/offline_inference/mlpspeculator.py
View file @
e1710393
...
...
@@ -4,8 +4,7 @@
This file demonstrates the usage of text generation with an LLM model,
comparing the performance with and without speculative decoding.
Note that still not support `v1`:
VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
Note that this example is out of date and not supported in vLLM v1.
"""
import
gc
...
...
examples/offline_inference/qwen2_5_omni/README.md
View file @
e1710393
...
...
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1
=
0
\
python examples/offline_inference/qwen2_5_omni/only_thinker.py
\
-q
use_audio_in_video
# Multiple audios
VLLM_USE_V1
=
0
\
python examples/offline_inference/qwen2_5_omni/only_thinker.py
\
-q
multi_audios
```
...
...
examples/offline_inference/qwen2_5_omni/only_thinker.py
View file @
e1710393
...
...
@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
from
typing
import
NamedTuple
import
vllm.envs
as
envs
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
...
...
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
)
asset
=
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
16
)
audio
=
asset
.
get_audio
(
sampling_rate
=
16000
)
assert
not
envs
.
VLLM_USE_V1
,
(
"V1 does not support use_audio_in_video. "
"Please launch this example with "
"`VLLM_USE_V1=0`."
)
return
QueryResult
(
inputs
=
{
"prompt"
:
prompt
,
...
...
examples/others/lmcache/cpu_offload_lmcache.py
View file @
e1710393
...
...
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
from
vllm.engine.arg_utils
import
EngineArgs
def
setup_environment_variables
(
vllm_version
:
str
):
def
setup_environment_variables
():
# LMCache-related environment variables
# Use experimental features in LMCache
os
.
environ
[
"LMCACHE_USE_EXPERIMENTAL"
]
=
"True"
...
...
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
os
.
environ
[
"LMCACHE_LOCAL_CPU"
]
=
"True"
# Set local CPU memory limit to 5.0 GB
os
.
environ
[
"LMCACHE_MAX_LOCAL_CPU_SIZE"
]
=
"5.0"
if
vllm_version
==
"v0"
:
os
.
environ
[
"VLLM_USE_V1"
]
=
"0"
@
contextlib
.
contextmanager
def
build_llm_with_lmcache
(
lmcache_connector
:
str
,
model
:
str
,
vllm_version
:
str
):
def
build_llm_with_lmcache
(
lmcache_connector
:
str
,
model
:
str
):
ktc
=
KVTransferConfig
(
kv_connector
=
lmcache_connector
,
kv_role
=
"kv_both"
,
...
...
@@ -60,21 +58,12 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
if
vllm_version
==
"v0"
:
llm_args
=
EngineArgs
(
model
=
model
,
kv_transfer_config
=
ktc
,
max_model_len
=
8000
,
gpu_memory_utilization
=
0.8
,
enable_chunked_prefill
=
True
,
# Only in v0
)
else
:
llm_args
=
EngineArgs
(
model
=
model
,
kv_transfer_config
=
ktc
,
max_model_len
=
8000
,
gpu_memory_utilization
=
0.8
,
)
llm_args
=
EngineArgs
(
model
=
model
,
kv_transfer_config
=
ktc
,
max_model_len
=
8000
,
gpu_memory_utilization
=
0.8
,
)
llm
=
LLM
(
**
asdict
(
llm_args
))
try
:
...
...
@@ -116,18 +105,10 @@ def parse_args():
def
main
():
args
=
parse_args
()
if
args
.
version
==
"v0"
:
lmcache_connector
=
"LMCacheConnector"
model
=
"mistralai/Mistral-7B-Instruct-v0.2"
else
:
lmcache_connector
=
"LMCacheConnectorV1"
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables
(
args
.
version
)
with
build_llm_with_lmcache
(
lmcache_connector
,
model
,
args
.
version
)
as
llm
:
lmcache_connector
=
"LMCacheConnectorV1"
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables
()
with
build_llm_with_lmcache
(
lmcache_connector
,
model
)
as
llm
:
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
shared_prompt
=
"Hello, how are you?"
*
1000
...
...
tests/entrypoints/openai/test_orca_metrics.py
View file @
e1710393
...
...
@@ -22,9 +22,6 @@ def monkeypatch_module():
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
])
def
server
(
request
,
monkeypatch_module
):
use_v1
=
request
.
param
monkeypatch_module
.
setenv
(
"VLLM_USE_V1"
,
"1"
if
use_v1
else
"0"
)
args
=
[
"--dtype"
,
"bfloat16"
,
...
...
vllm/envs.py
View file @
e1710393
...
...
@@ -100,7 +100,6 @@ if TYPE_CHECKING:
VLLM_SKIP_P2P_CHECK
:
bool
=
False
VLLM_DISABLED_KERNELS
:
list
[
str
]
=
[]
VLLM_DISABLE_PYNCCL
:
bool
=
False
VLLM_USE_V1
:
bool
=
True
VLLM_ROCM_USE_AITER
:
bool
=
False
VLLM_ROCM_USE_AITER_PAGED_ATTN
:
bool
=
False
VLLM_ROCM_USE_AITER_LINEAR
:
bool
=
True
...
...
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DISABLE_PYNCCL"
:
lambda
:
(
os
.
getenv
(
"VLLM_DISABLE_PYNCCL"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)
),
# If set, use the V1 code path.
"VLLM_USE_V1"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_V1"
,
"1"
))),
# Disable aiter ops unless specifically enabled.
# Acts as a parent switch to enable the rest of the other operations.
"VLLM_ROCM_USE_AITER"
:
lambda
:
(
...
...
@@ -1538,16 +1535,6 @@ def is_set(name: str):
raise
AttributeError
(
f
"module
{
__name__
!
r
}
has no attribute
{
name
!
r
}
"
)
def
set_vllm_use_v1
(
use_v1
:
bool
):
if
is_set
(
"VLLM_USE_V1"
):
raise
ValueError
(
"Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
"explicitly by the user. Please raise this as a Github "
"Issue and explicitly set VLLM_USE_V1=0 or 1."
)
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
if
use_v1
else
"0"
def
compute_hash
()
->
str
:
"""
WARNING: Whenever a new key is added to this environment
...
...
vllm/usage/usage_lib.py
View file @
e1710393
...
...
@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
"VLLM_USE_FLASHINFER_SAMPLER"
,
"VLLM_PP_LAYER_PARTITION"
,
"VLLM_USE_TRITON_AWQ"
,
"VLLM_USE_V1"
,
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment