Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d6464f26
Unverified
Commit
d6464f26
authored
Dec 10, 2025
by
Wentao Ye
Committed by
GitHub
Dec 11, 2025
Browse files
[Chore] Fix torch precision warning (#30428)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
7e24e5d4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
9 additions
and
7 deletions
+9
-7
tests/v1/e2e/test_async_scheduling.py
tests/v1/e2e/test_async_scheduling.py
+2
-2
vllm/envs.py
vllm/envs.py
+6
-4
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+1
-1
No files found.
tests/v1/e2e/test_async_scheduling.py
View file @
d6464f26
...
...
@@ -152,8 +152,8 @@ def run_tests(
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_AITER_FA"
)
else
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
# lock matmul precision to full FP32
m
.
setenv
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"
highest
"
)
# lock matmul precision to full FP32
(IEEE)
m
.
setenv
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"
ieee
"
)
# m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs
:
list
[
tuple
[
str
,
list
,
list
]]
=
[]
for
n
,
(
...
...
vllm/envs.py
View file @
d6464f26
...
...
@@ -74,7 +74,7 @@ if TYPE_CHECKING:
VLLM_MEDIA_CONNECTOR
:
str
=
"http"
VLLM_TARGET_DEVICE
:
str
=
"cuda"
VLLM_MAIN_CUDA_VERSION
:
str
=
"12.9"
VLLM_FLOAT32_MATMUL_PRECISION
:
Literal
[
"
highest"
,
"high"
,
"medium"
]
=
"highest
"
VLLM_FLOAT32_MATMUL_PRECISION
:
Literal
[
"
ieee"
,
"tf32"
]
=
"ieee
"
MAX_JOBS
:
str
|
None
=
None
NVCC_THREADS
:
str
|
None
=
None
VLLM_USE_PRECOMPILED
:
bool
=
False
...
...
@@ -456,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MAIN_CUDA_VERSION"
:
lambda
:
os
.
getenv
(
"VLLM_MAIN_CUDA_VERSION"
,
""
).
lower
()
or
"12.9"
,
# Controls PyTorch float32 matmul precision mode within vLLM workers.
# Valid options mirror torch.set_float32_matmul_precision
# Accepted values:
# - "ieee" (default): force full IEEE FP32 matmul precision.
# - "tf32": enable TensorFloat32-based fast matmul.
"VLLM_FLOAT32_MATMUL_PRECISION"
:
env_with_choices
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"
highest
"
,
[
"
highest"
,
"high"
,
"medium
"
],
"
ieee
"
,
[
"
ieee"
,
"tf32
"
],
case_sensitive
=
False
,
),
# Maximum number of compilation jobs to run in parallel.
...
...
vllm/v1/worker/gpu_worker.py
View file @
d6464f26
...
...
@@ -81,7 +81,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env.
precision
=
envs
.
VLLM_FLOAT32_MATMUL_PRECISION
torch
.
set_float32_
matmul_precision
(
precision
)
torch
.
backends
.
cuda
.
matmul
.
fp32
_precision
=
precision
if
self
.
model_config
.
trust_remote_code
:
# note: lazy import to avoid importing torch before initializing
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment