Commit fe6f3744 authored by Jing Zhang's avatar Jing Zhang
Browse files

disable asm for debug

parent 46a0aec1
...@@ -413,8 +413,8 @@ int main(int argc, char* argv[]) ...@@ -413,8 +413,8 @@ int main(int argc, char* argv[])
{ {
#if 1 #if 1
// 3x3, 34x34 // 3x3, 34x34
constexpr index_t N = 64; constexpr index_t N = 16;
constexpr index_t C = 256; constexpr index_t C = 8;
constexpr index_t HI = 34; constexpr index_t HI = 34;
constexpr index_t WI = 34; constexpr index_t WI = 34;
constexpr index_t K = 128; constexpr index_t K = 128;
......
...@@ -46,7 +46,7 @@ __device__ void vmcnt(index_t cnt) ...@@ -46,7 +46,7 @@ __device__ void vmcnt(index_t cnt)
__device__ void lgkmcnt(index_t cnt) __device__ void lgkmcnt(index_t cnt)
{ {
#if !NO_LGKM_WAIT #if 0
if(cnt == 0) if(cnt == 0)
{ {
asm volatile("\n \ asm volatile("\n \
...@@ -86,6 +86,7 @@ __device__ void lgkmcnt(index_t cnt) ...@@ -86,6 +86,7 @@ __device__ void lgkmcnt(index_t cnt)
__device__ void outerProduct1x4(const float* a, const float* b, float* c) __device__ void outerProduct1x4(const float* a, const float* b, float* c)
{ {
#if 0
asm volatile("\n \ asm volatile("\n \
v_mac_f32 %0, %4, %5 \n \ v_mac_f32 %0, %4, %5 \n \
v_mac_f32 %1, %4, %6 \n \ v_mac_f32 %1, %4, %6 \n \
...@@ -102,6 +103,12 @@ __device__ void outerProduct1x4(const float* a, const float* b, float* c) ...@@ -102,6 +103,12 @@ __device__ void outerProduct1x4(const float* a, const float* b, float* c)
"1"(c[1]), "1"(c[1]),
"2"(c[2]), "2"(c[2]),
"3"(c[3])); "3"(c[3]));
#else
c[0] += a[0] * b[0];
c[1] += a[0] * b[1];
c[2] += a[0] * b[2];
c[3] += a[0] * b[3];
#endif
} }
__device__ void outerProduct1x4(const float& a, __device__ void outerProduct1x4(const float& a,
...@@ -197,7 +204,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a, ...@@ -197,7 +204,7 @@ __device__ void outerProduct8x8(const vector_type<float, 4>::MemoryType* a,
__device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0) __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, index_t offset = 0)
{ {
#if !NO_DS_READ #if 0
if(offset == 0) if(offset == 0)
{ {
asm volatile("\n \ asm volatile("\n \
...@@ -418,6 +425,9 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in ...@@ -418,6 +425,9 @@ __device__ void ds_read_b128(vector_type<float, 4>::MemoryType& r, void* lds, in
{ {
assert(false); assert(false);
} }
#else
using Float4 = vector_type<float, 4>::MemoryType;
r = ((Float4*)lds)[offset];
#endif #endif
} }
......
...@@ -435,35 +435,48 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 ...@@ -435,35 +435,48 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
constexpr index_t a_lds_cluster_col_stride = sizeof(Float) * MPerLevel1Cluster; constexpr index_t a_lds_cluster_col_stride = sizeof(Float) * MPerLevel1Cluster;
constexpr index_t b_lds_cluster_col_stride = sizeof(Float) * NPerLevel1Cluster; constexpr index_t b_lds_cluster_col_stride = sizeof(Float) * NPerLevel1Cluster;
ds_read_b128(reg_a[0], a_lds_loc, 0); for(index_t k = 0; k < K; k++)
ds_read_b128(reg_b[0], b_lds_loc, 0);
ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride);
ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride);
lgkmcnt(2);
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
lgkmcnt(1);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
#pragma unroll
for(index_t k = 1; k < K; ++k)
{ {
ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride); ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride);
lgkmcnt(1);
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride); ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride);
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride); ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride);
ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride); ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride);
lgkmcnt(2);
outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
lgkmcnt(1);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
}
lgkmcnt(0);
outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
} }
//ds_read_b128(reg_a[0], a_lds_loc, 0);
//ds_read_b128(reg_b[0], b_lds_loc, 0);
//ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride);
//ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride);
//lgkmcnt(2);
//outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
//lgkmcnt(1);
//outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
//#pragma unroll
//for(index_t k = 1; k < K; ++k)
//{
//ds_read_b128(reg_a[0], a_lds_loc, k * a_lds_row_stride);
//lgkmcnt(1);
//outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
//ds_read_b128(reg_b[0], b_lds_loc, k * b_lds_row_stride);
//outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
//ds_read_b128(reg_b[1], b_lds_loc, b_lds_cluster_col_stride + k * b_lds_row_stride);
//ds_read_b128(reg_a[1], a_lds_loc, a_lds_cluster_col_stride + k * a_lds_row_stride);
//lgkmcnt(2);
//outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]);
//lgkmcnt(1);
//outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
//}
//lgkmcnt(0);
//outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]);
//outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]);
}
#endif #endif
template <class BlockMatrixC, index_t BlockMatrixStrideC, class FloatC> template <class BlockMatrixC, index_t BlockMatrixStrideC, class FloatC>
......
...@@ -273,11 +273,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn ...@@ -273,11 +273,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
__syncthreads(); __syncthreads();
#if 1 #if 0
blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread); blockwise_batch_gemm.Run(p_wei_block, p_in_block, p_out_thread);
#elif 0 #elif 0
blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread); blockwise_batch_gemm.Run_asm(p_wei_block, p_in_block, p_out_thread);
#elif 0 #elif 1
blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread); blockwise_batch_gemm.Run_asm_v2(p_wei_block, p_in_block, p_out_thread);
#endif #endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment