Commit 313f3c07 authored by Jing Zhang
Browse files

unroll k

parent 0f620a90
...@@ -385,9 +385,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -385,9 +385,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
void* a_loc = (void*)(p_a_block + mMyThreadOffsetA); void* a_loc = (void*)(p_a_block + mMyThreadOffsetA);
void* b_loc = (void*)(p_b_block + mMyThreadOffsetB); void* b_loc = (void*)(p_b_block + mMyThreadOffsetB);
// loop over k // loop over k
int k_chunk = 2; int k_chunk = K;
#pragma unroll //for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop * k_chunk)
for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop * k_chunk) index_t k_begin = 0;
{ {
#if 0 #if 0
...@@ -417,6 +417,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -417,6 +417,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
lgkmcnt(1); lgkmcnt(1);
outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]);
lgkmcnt(0); lgkmcnt(0);
#pragma unroll
for(int i = 0; i < k_chunk - 1; i++) for(int i = 0; i < k_chunk - 1; i++)
{ {
k = k + 1; k = k + 1;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment