[Bug] Revert torch warning fix (#31585)

Signed-off-by: yewentao256 <zhyanwentao@126.com>

[Bug] Revert torch warning fix (#31585)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
af9a7ec2 · Wentao Ye · GitHub · 276e03b9 · af9a7ec2 · af9a7ec2
Unverified commit af9a7ec2, authored Jan 05, 2026 by Wentao Ye; committed by GitHub on Jan 05, 2026
Showing 3 changed files with 6 additions and 8 deletions

tests/v1/e2e/test_async_scheduling.py tests/v1/e2e/test_async_scheduling.py +1 -1

vllm/envs.py vllm/envs.py +4 -6

vllm/v1/worker/gpu_worker.py vllm/v1/worker/gpu_worker.py +1 -1

No files found.
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -154,7 +154,7 @@ def run_tests(

    with monkeypatch.context() as m:
        # lock matmul precision to full FP32 (IEEE)
-        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
+        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
        # m.setenv("VLLM_BATCH_INVARIANT", "1")
        outputs: list[tuple[str, list, list]] = []
        for n, (

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -75,7 +75,7 @@ if TYPE_CHECKING:
    VLLM_MEDIA_CONNECTOR: str = "http"
    VLLM_TARGET_DEVICE: str = "cuda"
    VLLM_MAIN_CUDA_VERSION: str = "12.9"
-    VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
+    VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
    MAX_JOBS: str | None = None
    NVCC_THREADS: str | None = None
    VLLM_USE_PRECOMPILED: bool = False
@@ -459,13 +459,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
    or "12.9",
    # Controls PyTorch float32 matmul precision mode within vLLM workers.
-    # Accepted values:
-    #   - "ieee" (default): force full IEEE FP32 matmul precision.
-    #   - "tf32": enable TensorFloat32-based fast matmul.
+    # Valid options mirror torch.set_float32_matmul_precision
    "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
        "VLLM_FLOAT32_MATMUL_PRECISION",
-        "ieee",
-        ["ieee", "tf32"],
+        "highest",
+        ["highest", "high", "medium"],
        case_sensitive=False,
    ),
    # Maximum number of compilation jobs to run in parallel.

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -84,7 +84,7 @@ class Worker(WorkerBase):

        # configure float32 matmul precision according to vLLM env.
        precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
-        torch.backends.cuda.matmul.fp32_precision = precision
+        torch.set_float32_matmul_precision(precision)

        if self.model_config.trust_remote_code:
            # note: lazy import to avoid importing torch before initializing