disable asm for debug

fe6f3744 · Jing Zhang · 46a0aec1 · fe6f3744 · fe6f3744 · fe6f3744
Commit fe6f3744 authored Apr 26, 2019 by Jing Zhang
4 changed files
--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -413,8 +413,8 @@ int main(int argc, char* argv[])
 {
 #if 1
    // 3x3, 34x34
-    constexpr index_t N  = 64;
+    constexpr index_t N  = 16;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 8;
    constexpr index_t HI = 34;
    constexpr index_t WI = 34;
    constexpr index_t K  = 128;

--- a/src/include/amd_inline_asm.hip.hpp
+++ b/src/include/amd_inline_asm.hip.hpp
@@ -46,7 +46,7 @@ __device__ void vmcnt(index_t cnt)
 __device__ void lgkmcnt(index_t cnt)
 {
-#if !NO_LGKM_WAIT
+#if 0 
    if(cnt == 0)
    {
        asm volatile("\n \
@@ -86,6 +86,7 @@ __device__ void lgkmcnt(index_t cnt)
 __device__ void outerProduct1x4(const float* a, const float* b, float* c)
 {
+#if 0
    asm volatile("\n \
            v_mac_f32 %0, %4, %5 \n \
            v_mac_f32 %1, %4, %6 \n \
@@ -102,6 +103,12 @@ __device__ void outerProduct1x4(const float* a, const float* b, float* c)
                   "1"(c[1]),
                   "2"(c[2]),
                   "3"(c[3]));
+#else
+    c[0] += a[0] * b[0];
+    c[1] += a[0] * b[1];
+    c[2] += a[0] * b[2];
+    c[3] += a[0] * b[3];
+#endif
 }
 __device__ void outerProduct1x4(const float& a,
@@ -197,7 +204,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
 __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
 {
-#if !NO_DS_READ
+#if 0 
    if(offset == 0)
    {
        asm volatile("\n \
@@ -418,6 +425,9 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
    {
        assert(false);
    }
+#else
+    using Float4 = vector_type<float, 4>::MemoryType;
+    r = ((Float4*)lds)[offset]; 
 #endif
 }

--- a/src/include/blockwise_batched_gemm.hip.hpp
+++ b/src/include/blockwise_batched_gemm.hip.hpp
@@ -435,35 +435,48 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr index_t a_lds_cluster_col_stride = sizeof(Float) * MPerLevel1Cluster;
        constexpr index_t b_lds_cluster_col_stride = sizeof(Float) * NPerLevel1Cluster;
-        ds_read_b128(reg_a[0], a_lds_loc, 0);
+        for(index_t k = 0; k < K; k++)
-        ds_read_b128(reg_b[0], b_lds_loc, 0);
-        ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride);
-        ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride);
-        lgkmcnt(2);
-        outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
-        lgkmcnt(1);
-        outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
-#pragma unroll
-        for(index_t k = 1; k < K; ++k)
        {
            ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride);
-            lgkmcnt(1);
-            outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
            ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride);
-            outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
            ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride);
            ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride);
-            lgkmcnt(2);
            outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
-            lgkmcnt(1);
            outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
-        }
-        lgkmcnt(0);
            outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
            outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
        }
+        //ds_read_b128(reg_a[0], a_lds_loc, 0);
+        //ds_read_b128(reg_b[0], b_lds_loc, 0);
+        //ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride);
+        //ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride);
+        //lgkmcnt(2);
+        //outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
+        //lgkmcnt(1);
+        //outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
+//#pragma unroll
+        //for(index_t k = 1; k < K; ++k)
+        //{
+            //ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride);
+            //lgkmcnt(1);
+            //outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
+            //ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride);
+            //outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
+            //ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride);
+            //ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride);
+            //lgkmcnt(2);
+            //outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
+            //lgkmcnt(1);
+            //outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
+        //}
+        //lgkmcnt(0);
+        //outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
+        //outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
+    }
 #endif
    template <class BlockMatrixC, index_t BlockMatrixStrideC, class FloatC>

--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
@@ -273,11 +273,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                    __syncthreads();
-#if 1
+#if 0
                    blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
 #elif 0
                    blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread);
-#elif 0
+#elif 1
                    blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread);
 #endif