Unverified Commit de81b7df authored by Kshitij Lakhani, committed by GitHub

Further relax constraints to cuDNN 9.13 for disabling fused attn for kv caching (#2121)


Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
parent 1e2c68d6
@@ -434,8 +434,8 @@ def get_attention_backend(
     #         | FP8            | non-paged/paged | sm90 | thd           | >= 1
     # Unfused | FP32/FP16/BF16 | non-paged/paged | all  | bshd,sbhd,thd | >= 1
     if inference_params is not None:
-        if device_compute_capability == (8, 9) and cudnn_version <= (9, 12, 0):
-            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.12")
+        if device_compute_capability == (8, 9) and cudnn_version <= (9, 13, 0):
+            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.13")
             use_fused_attention = False
         if context_parallel:
             logger.debug("Disabling all backends for KV caching with context parallelism")
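For context on the gating logic: the compute capability and cuDNN version are held as Python tuples, which compare lexicographically, so `cudnn_version <= (9, 13, 0)` matches every release up to and including 9.13.0 but not 9.13.1 or later, even though the log message reads "cuDNN <= 9.13". Below is a minimal standalone sketch of the condition; should_disable_fused_attention is a hypothetical helper for illustration, not part of the Transformer Engine API.

def should_disable_fused_attention(device_compute_capability, cudnn_version):
    # Hypothetical helper mirroring the condition in the diff above: FusedAttention
    # is disabled for KV caching on sm89 when cuDNN is at or below 9.13.0.
    return device_compute_capability == (8, 9) and cudnn_version <= (9, 13, 0)

# Tuple comparison proceeds element by element, left to right:
assert should_disable_fused_attention((8, 9), (9, 12, 1))      # 9.12.1 <= 9.13.0
assert should_disable_fused_attention((8, 9), (9, 13, 0))      # boundary release, still disabled
assert not should_disable_fused_attention((8, 9), (9, 13, 1))  # 9.13.1 compares greater, gate no longer applies
assert not should_disable_fused_attention((9, 0), (9, 12, 1))  # sm90 is unaffected by this gate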