Unverified commit fd2f589f authored by Tim Moon, committed by GitHub

[PyTorch] Bump minimum cuDNN version for fused attention with FP8 current scaling (#2236)



* Require cuDNN 9.14.0+ for fused attention with FP8 current scaling
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 85a91997
@@ -469,13 +469,13 @@ def get_attention_backend(
         fp8_recipe = fp8_meta["recipe"]
         if fp8_meta.get("local_recipes", None) is not None:
             fp8_recipe = fp8_meta["local_recipes"][0]
-        if (
-            use_fused_attention
-            and fp8_recipe.float8_current_scaling()
-            and device_compute_capability < (10, 0)
-        ):
-            logger.debug("Disabling FusedAttention for FP8 current scaling on arch < sm100")
-            use_fused_attention = False
+        if use_fused_attention and fp8_recipe.float8_current_scaling():
+            if device_compute_capability < (10, 0):
+                logger.debug("Disabling FusedAttention for FP8 current scaling on arch < sm100")
+                use_fused_attention = False
+            elif cudnn_version < (9, 14, 0):
+                logger.debug("Disabling FusedAttention for FP8 current scaling with cuDNN < 9.14.0")
+                use_fused_attention = False
     # Filter: KV cache
     # backend | precision | KV cache | architecture | qkv_format | page_size
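For context, the new filter gates the fused-attention backend by comparing version and compute-capability tuples lexicographically. The following is a minimal standalone sketch of that gating pattern; the hard-coded values for cudnn_version and device_compute_capability are stand-ins for illustration only, since in Transformer Engine they come from runtime queries of cuDNN and the GPU.

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("attention_backend")

# Hypothetical stand-in values; Transformer Engine queries these at runtime.
cudnn_version = (9, 13, 1)
device_compute_capability = (10, 0)

use_fused_attention = True
fp8_current_scaling = True  # stands in for fp8_recipe.float8_current_scaling()

if use_fused_attention and fp8_current_scaling:
    # Python tuples compare element-wise, so (9, 13, 1) < (9, 14, 0) is True.
    if device_compute_capability < (10, 0):
        logger.debug("Disabling FusedAttention for FP8 current scaling on arch < sm100")
        use_fused_attention = False
    elif cudnn_version < (9, 14, 0):
        logger.debug("Disabling FusedAttention for FP8 current scaling with cuDNN < 9.14.0")
        use_fused_attention = False

print(f"use_fused_attention = {use_fused_attention}")  # False with the stand-in values above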