[V1] Enable V1 for compute capability < 8.0 + FP32 (#23614)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[V1] Enable V1 for compute capability < 8.0 + FP32 (#23614)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
50fede66 · Cyrus Leung · GitHub · b5d34af3 · 50fede66
Unverified Commit 50fede66 authored Aug 26, 2025 by Cyrus Leung Committed by GitHub Aug 26, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 8 deletions

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +8 -8

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1433,15 +1433,15 @@ class EngineArgs:
                               recommend_to_remove=True)
            return False

-        # Need at least Ampere for now (FA support required).
-        # Skip this check if we are running on a non-GPU platform,
-        # or if the device capability is not available
-        # (e.g. in a Ray actor without GPUs).
+        # Triton v3.3 has an FP16 conversion regression on Turing and Volta
+        # that breaks FP16 inference.
+        # See: https://github.com/triton-lang/triton/issues/6698
        if (current_platform.is_cuda()
-                and current_platform.get_device_capability()
-                and current_platform.get_device_capability().major < 8):
-            _raise_or_fallback(feature_name="Compute Capability < 8.0",
-                               recommend_to_remove=False)
+                and not current_platform.has_device_capability(80)
+                and model_config.dtype == torch.float16):
+            _raise_or_fallback(
+                feature_name="Compute Capability < 8.0 with FP16",
+                recommend_to_remove=False)
            return False

        if self.kv_cache_dtype != "auto":