Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e1710393
Unverified
Commit
e1710393
authored
Nov 12, 2025
by
wangxiyuan
Committed by
GitHub
Nov 11, 2025
Browse files
[[V0 deprecation]]Remove VLLM_USE_V1 env (#28204)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
3f770f44
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
15 additions
and
59 deletions
+15
-59
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
+1
-1
examples/offline_inference/mlpspeculator.py
examples/offline_inference/mlpspeculator.py
+1
-2
examples/offline_inference/qwen2_5_omni/README.md
examples/offline_inference/qwen2_5_omni/README.md
+0
-2
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen2_5_omni/only_thinker.py
+1
-6
examples/others/lmcache/cpu_offload_lmcache.py
examples/others/lmcache/cpu_offload_lmcache.py
+12
-31
tests/entrypoints/openai/test_orca_metrics.py
tests/entrypoints/openai/test_orca_metrics.py
+0
-3
vllm/envs.py
vllm/envs.py
+0
-13
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+0
-1
No files found.
.buildkite/scripts/hardware_ci/run-cpu-test.sh
View file @
e1710393
...
@@ -76,7 +76,7 @@ function cpu_tests() {
...
@@ -76,7 +76,7 @@ function cpu_tests() {
# Run AWQ test
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
# set -e
#
VLLM_USE_V1=0
pytest -x -s -v \
# pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
# Run multi-lora tests
...
...
examples/offline_inference/mlpspeculator.py
View file @
e1710393
...
@@ -4,8 +4,7 @@
...
@@ -4,8 +4,7 @@
This file demonstrates the usage of text generation with an LLM model,
This file demonstrates the usage of text generation with an LLM model,
comparing the performance with and without speculative decoding.
comparing the performance with and without speculative decoding.
Note that still not support `v1`:
Note that this example is out of date and not supported in vLLM v1.
VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
"""
"""
import
gc
import
gc
...
...
examples/offline_inference/qwen2_5_omni/README.md
View file @
e1710393
...
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
...
@@ -11,12 +11,10 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
# Read vision and audio inputs from a single video file
# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1
=
0
\
python examples/offline_inference/qwen2_5_omni/only_thinker.py
\
python examples/offline_inference/qwen2_5_omni/only_thinker.py
\
-q
use_audio_in_video
-q
use_audio_in_video
# Multiple audios
# Multiple audios
VLLM_USE_V1
=
0
\
python examples/offline_inference/qwen2_5_omni/only_thinker.py
\
python examples/offline_inference/qwen2_5_omni/only_thinker.py
\
-q
multi_audios
-q
multi_audios
```
```
...
...
examples/offline_inference/qwen2_5_omni/only_thinker.py
View file @
e1710393
...
@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
...
@@ -7,7 +7,6 @@ with the correct prompt format on Qwen2.5-Omni (thinker only).
from
typing
import
NamedTuple
from
typing
import
NamedTuple
import
vllm.envs
as
envs
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
...
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
...
@@ -72,11 +71,7 @@ def get_use_audio_in_video_query() -> QueryResult:
)
)
asset
=
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
16
)
asset
=
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
16
)
audio
=
asset
.
get_audio
(
sampling_rate
=
16000
)
audio
=
asset
.
get_audio
(
sampling_rate
=
16000
)
assert
not
envs
.
VLLM_USE_V1
,
(
"V1 does not support use_audio_in_video. "
"Please launch this example with "
"`VLLM_USE_V1=0`."
)
return
QueryResult
(
return
QueryResult
(
inputs
=
{
inputs
=
{
"prompt"
:
prompt
,
"prompt"
:
prompt
,
...
...
examples/others/lmcache/cpu_offload_lmcache.py
View file @
e1710393
...
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
...
@@ -37,7 +37,7 @@ from vllm.config import KVTransferConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
def
setup_environment_variables
(
vllm_version
:
str
):
def
setup_environment_variables
():
# LMCache-related environment variables
# LMCache-related environment variables
# Use experimental features in LMCache
# Use experimental features in LMCache
os
.
environ
[
"LMCACHE_USE_EXPERIMENTAL"
]
=
"True"
os
.
environ
[
"LMCACHE_USE_EXPERIMENTAL"
]
=
"True"
...
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
...
@@ -47,12 +47,10 @@ def setup_environment_variables(vllm_version: str):
os
.
environ
[
"LMCACHE_LOCAL_CPU"
]
=
"True"
os
.
environ
[
"LMCACHE_LOCAL_CPU"
]
=
"True"
# Set local CPU memory limit to 5.0 GB
# Set local CPU memory limit to 5.0 GB
os
.
environ
[
"LMCACHE_MAX_LOCAL_CPU_SIZE"
]
=
"5.0"
os
.
environ
[
"LMCACHE_MAX_LOCAL_CPU_SIZE"
]
=
"5.0"
if
vllm_version
==
"v0"
:
os
.
environ
[
"VLLM_USE_V1"
]
=
"0"
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
def
build_llm_with_lmcache
(
lmcache_connector
:
str
,
model
:
str
,
vllm_version
:
str
):
def
build_llm_with_lmcache
(
lmcache_connector
:
str
,
model
:
str
):
ktc
=
KVTransferConfig
(
ktc
=
KVTransferConfig
(
kv_connector
=
lmcache_connector
,
kv_connector
=
lmcache_connector
,
kv_role
=
"kv_both"
,
kv_role
=
"kv_both"
,
...
@@ -60,15 +58,6 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
...
@@ -60,15 +58,6 @@ def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
# memory. Reduce the value if your GPU has less memory.
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
# Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
if
vllm_version
==
"v0"
:
llm_args
=
EngineArgs
(
model
=
model
,
kv_transfer_config
=
ktc
,
max_model_len
=
8000
,
gpu_memory_utilization
=
0.8
,
enable_chunked_prefill
=
True
,
# Only in v0
)
else
:
llm_args
=
EngineArgs
(
llm_args
=
EngineArgs
(
model
=
model
,
model
=
model
,
kv_transfer_config
=
ktc
,
kv_transfer_config
=
ktc
,
...
@@ -116,18 +105,10 @@ def parse_args():
...
@@ -116,18 +105,10 @@ def parse_args():
def
main
():
def
main
():
args
=
parse_args
()
if
args
.
version
==
"v0"
:
lmcache_connector
=
"LMCacheConnector"
model
=
"mistralai/Mistral-7B-Instruct-v0.2"
else
:
lmcache_connector
=
"LMCacheConnectorV1"
lmcache_connector
=
"LMCacheConnectorV1"
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
model
=
"meta-llama/Meta-Llama-3.1-8B-Instruct"
setup_environment_variables
()
setup_environment_variables
(
args
.
version
)
with
build_llm_with_lmcache
(
lmcache_connector
,
model
)
as
llm
:
with
build_llm_with_lmcache
(
lmcache_connector
,
model
,
args
.
version
)
as
llm
:
# This example script runs two requests with a shared prefix.
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
# Define the shared prompt and specific prompts
shared_prompt
=
"Hello, how are you?"
*
1000
shared_prompt
=
"Hello, how are you?"
*
1000
...
...
tests/entrypoints/openai/test_orca_metrics.py
View file @
e1710393
...
@@ -22,9 +22,6 @@ def monkeypatch_module():
...
@@ -22,9 +22,6 @@ def monkeypatch_module():
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
])
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
])
def
server
(
request
,
monkeypatch_module
):
def
server
(
request
,
monkeypatch_module
):
use_v1
=
request
.
param
monkeypatch_module
.
setenv
(
"VLLM_USE_V1"
,
"1"
if
use_v1
else
"0"
)
args
=
[
args
=
[
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
...
...
vllm/envs.py
View file @
e1710393
...
@@ -100,7 +100,6 @@ if TYPE_CHECKING:
...
@@ -100,7 +100,6 @@ if TYPE_CHECKING:
VLLM_SKIP_P2P_CHECK
:
bool
=
False
VLLM_SKIP_P2P_CHECK
:
bool
=
False
VLLM_DISABLED_KERNELS
:
list
[
str
]
=
[]
VLLM_DISABLED_KERNELS
:
list
[
str
]
=
[]
VLLM_DISABLE_PYNCCL
:
bool
=
False
VLLM_DISABLE_PYNCCL
:
bool
=
False
VLLM_USE_V1
:
bool
=
True
VLLM_ROCM_USE_AITER
:
bool
=
False
VLLM_ROCM_USE_AITER
:
bool
=
False
VLLM_ROCM_USE_AITER_PAGED_ATTN
:
bool
=
False
VLLM_ROCM_USE_AITER_PAGED_ATTN
:
bool
=
False
VLLM_ROCM_USE_AITER_LINEAR
:
bool
=
True
VLLM_ROCM_USE_AITER_LINEAR
:
bool
=
True
...
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -884,8 +883,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_DISABLE_PYNCCL"
:
lambda
:
(
"VLLM_DISABLE_PYNCCL"
:
lambda
:
(
os
.
getenv
(
"VLLM_DISABLE_PYNCCL"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)
os
.
getenv
(
"VLLM_DISABLE_PYNCCL"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)
),
),
# If set, use the V1 code path.
"VLLM_USE_V1"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_V1"
,
"1"
))),
# Disable aiter ops unless specifically enabled.
# Disable aiter ops unless specifically enabled.
# Acts as a parent switch to enable the rest of the other operations.
# Acts as a parent switch to enable the rest of the other operations.
"VLLM_ROCM_USE_AITER"
:
lambda
:
(
"VLLM_ROCM_USE_AITER"
:
lambda
:
(
...
@@ -1538,16 +1535,6 @@ def is_set(name: str):
...
@@ -1538,16 +1535,6 @@ def is_set(name: str):
raise
AttributeError
(
f
"module
{
__name__
!
r
}
has no attribute
{
name
!
r
}
"
)
raise
AttributeError
(
f
"module
{
__name__
!
r
}
has no attribute
{
name
!
r
}
"
)
def
set_vllm_use_v1
(
use_v1
:
bool
):
if
is_set
(
"VLLM_USE_V1"
):
raise
ValueError
(
"Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
"explicitly by the user. Please raise this as a Github "
"Issue and explicitly set VLLM_USE_V1=0 or 1."
)
os
.
environ
[
"VLLM_USE_V1"
]
=
"1"
if
use_v1
else
"0"
def
compute_hash
()
->
str
:
def
compute_hash
()
->
str
:
"""
"""
WARNING: Whenever a new key is added to this environment
WARNING: Whenever a new key is added to this environment
...
...
vllm/usage/usage_lib.py
View file @
e1710393
...
@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
...
@@ -42,7 +42,6 @@ _USAGE_ENV_VARS_TO_COLLECT = [
"VLLM_USE_FLASHINFER_SAMPLER"
,
"VLLM_USE_FLASHINFER_SAMPLER"
,
"VLLM_PP_LAYER_PARTITION"
,
"VLLM_PP_LAYER_PARTITION"
,
"VLLM_USE_TRITON_AWQ"
,
"VLLM_USE_TRITON_AWQ"
,
"VLLM_USE_V1"
,
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
]
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment