"...git@developer.sourcefind.cn:OpenDAS/TransformerEngine.git" did not exist on "b4ef463d40331f389791eddde328c504e371ce8f"
Unverified Commit de81b7df authored by Kshitij Lakhani, committed by GitHub
Browse files

Further relax constraints to cuDNN 9.13 for disabling fused attn for kv caching (#2121)


Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
parent 1e2c68d6
@@ -434,8 +434,8 @@ def get_attention_backend(
     #          | FP8            | non-paged/paged | sm90 | thd           | >= 1
     # Unfused  | FP32/FP16/BF16 | non-paged/paged | all  | bshd,sbhd,thd | >= 1
     if inference_params is not None:
-        if device_compute_capability == (8, 9) and cudnn_version <= (9, 12, 0):
-            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.12")
+        if device_compute_capability == (8, 9) and cudnn_version <= (9, 13, 0):
+            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.13")
             use_fused_attention = False
         if context_parallel:
             logger.debug("Disabling all backends for KV caching with context parallelism")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment