"googlemock/include/vscode:/vscode.git/clone" did not exist on "9311242db422dd6f24c8e764847fe5d70d0d4859"
Commit 1d1e8efe authored by Kshitij Lakhani's avatar Kshitij Lakhani Committed by Kshitij Janardan Lakhani
Browse files

Further relax constraints to cuDNN 9.13 for disabling fused attn for kv caching (#2121)


Signed-off-by: Kshitij Lakhani <klakhani@nvidia.com>
parent d7874aad
@@ -434,8 +434,8 @@ def get_attention_backend(
     #          | FP8            | non-paged/paged | sm90 | thd           | >= 1
     # Unfused  | FP32/FP16/BF16 | non-paged/paged | all  | bshd,sbhd,thd | >= 1
     if inference_params is not None:
-        if device_compute_capability == (8, 9) and cudnn_version <= (9, 12, 0):
-            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.12")
+        if device_compute_capability == (8, 9) and cudnn_version <= (9, 13, 0):
+            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.13")
             use_fused_attention = False
         if context_parallel:
             logger.debug("Disabling all backends for KV caching with context parallelism")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment