sanity pass

64877b2c · aska-0096 · 7761e523 · 64877b2c · 64877b2c · 64877b2c
Commit 64877b2c authored Aug 01, 2023 by aska-0096
3 changed files
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
@@ -692,8 +692,16 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
        constexpr auto WmmaK = 16;
        constexpr auto KPack = math::integer_least_multiple(K1, WmmaK);

+        // In inline asm mode, you can choose either:
+        // 1. BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO
+        //    This one generates clear assembly code, but its performance is low.
+        // Or
+        // 2. BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle
+        //    This one has redundant "ds_load" instructions because the compiler is limited in
+        //    optimizing code with inline assembly. Though the generated code has more lines,
+        //    its performance is high.
        auto blockwise_gemm =
-            BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle<BlockSize,
+            BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO<BlockSize,
                                                         ADataType,
                                                         BDataType,
                                                         AccDataType,

--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -358,7 +358,7 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
 // Ranged input operand
 __device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, half16_t b, float8_t& c)
 {
-#if defined(__gfx11__)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
    asm volatile("v_wmma_f32_16x16x16_f16 %0, %1, %2, %0" : "=v"(c) : "v"(a), "v"(b), "0"(c));
 #else
    ignore = a;

--- a/include/ck/utility/amd_wmma.hpp
+++ b/include/ck/utility/amd_wmma.hpp
@@ -23,11 +23,11 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16>
    {
        // * Inline assembly need to elimate the duplicated data load, compiler won't help you
        // delete them.
-        // amd_assembly_wmma_f32_16x16x16_f16_w32(
-        //     reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{}));
+        amd_assembly_wmma_f32_16x16x16_f16_w32(
+            reg_a, reg_b, reg_c.template AsType<float8_t>()(Number<0>{}));
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
-        reg_c.template AsType<float8_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
-            reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
+        // reg_c.template AsType<float8_t>()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
+            // reg_a, reg_b, reg_c.template AsType<float8_t>()[Number<0>{}]);
 #else
        ignore = reg_a;
        ignore = reg_b;