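A fix: update the DeviceOpInstance template arguments in the gemm_bilinear WMMA fp16 example, and size the `__shared__` buffer in the gridwise multiple-D WMMA cshuffle kernels with `GridwiseOp::SharedMemTrait::lds_size` instead of `GridwiseOp::GetSharedMemoryNumberOfByte()`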

b8e153a4 · aska-0096 · 44be6438 · b8e153a4 · b8e153a4
Commit b8e153a4 authored Mar 27, 2023 by aska-0096
2 changed files
--- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
@@ -87,8 +87,8 @@ using DeviceOpInstance =
                                                                    8,
                                                                    16,
                                                                    16,
-                                                                    8,
+                                                                    4,
-                                                                    1,
+                                                                    2,
                                                                    S<4, 64, 1>,
                                                                    S<1, 0, 2>,
                                                                    S<1, 0, 2>,
@@ -105,7 +105,7 @@ using DeviceOpInstance =
                                                                    true,
                                                                    1,
                                                                    1,
-                                                                    S<1, 16, 1, 16>,
+                                                                    S<1, 32, 1, 8>,
                                                                    8>;
 int main(int argc, char* argv[])

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -151,7 +151,7 @@ __global__ void
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
    defined(__gfx1102__))
    // printf("entry kernel launch");
-    __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -241,7 +241,7 @@ __global__ void
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \
    defined(__gfx1102__))
-    __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size];
    GridwiseOp::template Run<HasMainKBlockLoop>(p_a_grid,
                                                p_b_grid,