Change K=64 config

ac407086 · ltqin · a465a936 · ac407086 · ac407086 · ac407086
Commit ac407086 authored Feb 27, 2023 by ltqin
3 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_fp16.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_fp16.cpp
@@ -25,7 +25,7 @@ Kernel outputs:
 #define PRINT_HOST 0
 #define USING_MASK 1
-#define USING_K128 1
+#define USING_K128 0
 #include <iostream>
 #include <numeric>
@@ -213,7 +213,7 @@ using DeviceGemmInstance =
        S<0, 2, 1>,
        S<0, 2, 1>,
        1,
-        4,
+        2,
        2,
        false,
        1,              // CShuffleMXdlPerWavePerShuffle
@@ -340,8 +340,8 @@ int run(int argc, char* argv[])
    ck::index_t K  = 64;
    ck::index_t O  = 64;
 #endif
-    ck::index_t G0 = 54;
+    ck::index_t G0 = 3;
-    ck::index_t G1 = 16;
+    ck::index_t G1 = 2;
    float alpha = 1.f / std::sqrt(K);

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_multihead_attention_backward_xdl_cshuffle.hpp
@@ -49,7 +49,7 @@ template <typename GridwiseGemm,
          bool HasMainKBlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, /*CK_MIN_BLOCK_PER_CU*/1)
 #endif
        kernel_batched_multihead_attention_backward_xdl_cshuffle_v2(
            const DataType* __restrict__ p_a_grid,

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp
@@ -90,7 +90,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
    {
        using type = T;
    };
-#if defined(__gfx90a__)
+#if defined(__gfx90a_masking__)
    template <>
    struct TypeMap<ck::half_t>
    {