Unverified Commit 71ade772 authored by Lyu Han's avatar Lyu Han Committed by GitHub
Browse files

[Fix] Set max dynamic smem size for decoder MHA to support context length > 8k (#377)

* Fix crash when context window size is large by setting max dynamic smem size

* fix linting
parent 57cf99b9
......@@ -28,16 +28,18 @@
// Launch the masked multi-head-attention (MMHA) decoder kernel.
//
// Names expected in the invoking scope:
//   - params: kernel parameter struct (read here for num_heads / batch_size,
//             and passed through to the kernel and to smem_size_in_bytes)
//   - stream: CUDA stream the kernel is launched on
//
// Grid layout: one block per (head, batch) pair, THDS_PER_BLOCK threads each.
// The dynamic shared-memory requirement grows with the context length and can
// exceed the default 48 KB per-block limit; we therefore raise the kernel's
// cudaFuncAttributeMaxDynamicSharedMemorySize to smem_sz before launching
// (opting in to >48 KB requires Volta/SM70 or newer). Taking the kernel's
// address once into `func` lets the same instantiation be used for both the
// attribute call and the launch.
// NOTE(review): the cudaFuncSetAttribute return code is not checked here,
// matching the surrounding code's style — a failure would surface at the
// subsequent launch.
#define MMHA_LAUNCH_KERNEL(                                                                                            \
    T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, HAS_BEAMS, QUANT_POLICY, stream)                      \
    auto func = &mmha::masked_multihead_attention_kernel<T,                                                            \
                                                         Dh,                                                           \
                                                         Dh_MAX,                                                       \
                                                         THDS_PER_KEY,                                                 \
                                                         THDS_PER_VALUE,                                               \
                                                         THDS_PER_BLOCK,                                               \
                                                         HAS_BEAMS,                                                    \
                                                         QUANT_POLICY>;                                                \
    size_t smem_sz = mmha::smem_size_in_bytes<T>(params, THDS_PER_VALUE, THDS_PER_BLOCK);                              \
    dim3   grid(params.num_heads, params.batch_size);                                                                  \
    cudaFuncSetAttribute(func, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz);                                  \
    func<<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params)
////////////////////////////////////////////////////////////////////////////////////////////////////
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment