Unverified commit d6464f26, authored by Wentao Ye, committed by GitHub
Browse files

[Chore] Fix torch precision warning (#30428)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent 7e24e5d4
...@@ -152,8 +152,8 @@ def run_tests( ...@@ -152,8 +152,8 @@ def run_tests(
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA") m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
else: else:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
# lock matmul precision to full FP32 # lock matmul precision to full FP32 (IEEE)
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
# m.setenv("VLLM_BATCH_INVARIANT", "1") # m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs: list[tuple[str, list, list]] = [] outputs: list[tuple[str, list, list]] = []
for n, ( for n, (
......
...@@ -74,7 +74,7 @@ if TYPE_CHECKING: ...@@ -74,7 +74,7 @@ if TYPE_CHECKING:
VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MEDIA_CONNECTOR: str = "http"
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
VLLM_MAIN_CUDA_VERSION: str = "12.9" VLLM_MAIN_CUDA_VERSION: str = "12.9"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
MAX_JOBS: str | None = None MAX_JOBS: str | None = None
NVCC_THREADS: str | None = None NVCC_THREADS: str | None = None
VLLM_USE_PRECOMPILED: bool = False VLLM_USE_PRECOMPILED: bool = False
...@@ -456,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -456,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
or "12.9", or "12.9",
# Controls PyTorch float32 matmul precision mode within vLLM workers. # Controls PyTorch float32 matmul precision mode within vLLM workers.
# Valid options mirror torch.set_float32_matmul_precision # Accepted values:
# - "ieee" (default): force full IEEE FP32 matmul precision.
# - "tf32": enable TensorFloat32-based fast matmul.
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
"VLLM_FLOAT32_MATMUL_PRECISION", "VLLM_FLOAT32_MATMUL_PRECISION",
"highest", "ieee",
["highest", "high", "medium"], ["ieee", "tf32"],
case_sensitive=False, case_sensitive=False,
), ),
# Maximum number of compilation jobs to run in parallel. # Maximum number of compilation jobs to run in parallel.
......
...@@ -81,7 +81,7 @@ class Worker(WorkerBase): ...@@ -81,7 +81,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env. # configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.set_float32_matmul_precision(precision) torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code: if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing # note: lazy import to avoid importing torch before initializing
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment