fixed layout

2052dfc9 · Jing Zhang · f221c68e · 2052dfc9 · 2052dfc9 · 2052dfc9
Commit 2052dfc9 authored Feb 29, 2024 by Jing Zhang
4 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp
@@ -416,7 +416,7 @@ struct BlockwiseGemmWMMA
    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor(
        make_tuple(Number<KPack / A_K1 / A_KRow>{}, Number<MRepeat>{}, I1, I1, I1, Number<A_K1>{}),
        make_tuple(Number<A_K1>{},
-                   Number<KPack / A_KRow>{},
+                   Number<A_KRow * A_K1>{},
                   Number<A_K1>{},
                   Number<A_K1>{},
                   Number<A_K1>{},
@@ -425,7 +425,7 @@ struct BlockwiseGemmWMMA
    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor(
        make_tuple(Number<KPack / B_K1 / B_KRow>{}, Number<NRepeat>{}, I1, I1, I1, Number<B_K1>{}),
        make_tuple(Number<B_K1>{},
-                   Number<KPack / B_KRow>{},
+                   Number<B_KRow * B_K1>{},
                   Number<B_K1>{},
                   Number<B_K1>{},
                   Number<B_K1>{},

--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
@@ -97,8 +97,10 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
    static constexpr auto AEnableLds_manu = false;
    static constexpr auto BEnableLds_manu = false;

-    static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
-    static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);
+    static constexpr auto AEnableLds =
+        true; // AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1);
+    static constexpr auto BEnableLds =
+        true; // BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1);

    static constexpr auto matrix_padder =
        MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock};

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
@@ -135,7 +135,7 @@ struct GridwiseGemm_Wmma

    static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma);
    static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
-    static constexpr auto WmmaK  = K1 == 16 ? 32 : 16;
+    static constexpr auto WmmaK  = (K1 == 16) ? 32 : 16;

    using ThisThreadBlock = ThisThreadBlock<BlockSize>;


--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0  -fPIE  -Wno-gnu-line-marker"     \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS="gfx908;gfx90a;gfx940"                                                             \
+-D GPU_TARGETS="gfx1200"                                                             \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}