Commit d2cd5658 authored by Muhammed Ozturk's avatar Muhammed Ozturk
Browse files

Tensor Contraction Complex Data Type is working

parent 160cf6ed
...@@ -67,17 +67,17 @@ __device__ Complex ComplexAdd(Complex a, Complex b) ...@@ -67,17 +67,17 @@ __device__ Complex ComplexAdd(Complex a, Complex b)
} }
__global__ void kernel__1_1(float* dev_t3, __global__ void kernel__1_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y, int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually // when opt_pre_computed == -1, all indices will be calculated manually
...@@ -101,13 +101,17 @@ int size_internal) ...@@ -101,13 +101,17 @@ int size_internal)
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a; int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
float temp_av; Complex temp_av;
float temp_bv[6]; Complex temp_bv[6];
float reg_tile[6][6]; Complex reg_tile[6][6];
for (int i = 0; i < 6; i++) for (int i = 0; i < 6; i++){
for (int j = 0; j < 6; j++) for (int j = 0; j < 6; j++){
reg_tile[i][j] = 0.0;
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -154,12 +158,19 @@ int size_internal) ...@@ -154,12 +158,19 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -179,18 +190,18 @@ int size_internal) ...@@ -179,18 +190,18 @@ int size_internal)
} }
} }
// created by tc_gen_code_Kernel() // Tensor Contraction Kernel
__global__ void kernel__2_1(float* dev_t3, __global__ void kernel__2_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y, int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
int internal_upperbound = 0; int internal_upperbound = 0;
...@@ -217,13 +228,17 @@ int size_internal) ...@@ -217,13 +228,17 @@ int size_internal)
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a; int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
float temp_av; Complex temp_av;
float temp_bv[6]; Complex temp_bv[6];
float reg_tile[6][6]; Complex reg_tile[6][6];
for (int i = 0; i < 6; i++) for (int i = 0; i < 6; i++){
for (int j = 0; j < 6; j++) for (int j = 0; j < 6; j++){
reg_tile[i][j] = 0.0;
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -274,12 +289,19 @@ int size_internal) ...@@ -274,12 +289,19 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -299,7 +321,7 @@ int size_internal) ...@@ -299,7 +321,7 @@ int size_internal)
} }
} }
// created by tc_gen_code_Kernel() // Tensor Contraction Kernel
__global__ void kernel__3_1(Complex* dev_t3, __global__ void kernel__3_1(Complex* dev_t3,
Complex* dev_t2, Complex* dev_t2,
Complex* dev_v2, Complex* dev_v2,
...@@ -376,7 +398,7 @@ int size_internal) ...@@ -376,7 +398,7 @@ int size_internal)
for (int j = 0; j < 6; j++){ for (int j = 0; j < 6; j++){
reg_tile[i][j].re = 0.0; reg_tile[i][j].re = 0.0;
reg_tile[i][j].im reg_tile[i][j].im = 0.0;
} }
} }
...@@ -463,18 +485,18 @@ int size_internal) ...@@ -463,18 +485,18 @@ int size_internal)
} }
} }
// created by tc_gen_code_Kernel() // Tensor Contraction Kernel
__global__ void kernel__4_1(float* dev_t3, __global__ void kernel__4_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y, int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
int internal_upperbound = 0; int internal_upperbound = 0;
...@@ -535,13 +557,17 @@ int size_internal) ...@@ -535,13 +557,17 @@ int size_internal)
rng_d = size_d % SIZE_SLICE_1_D; rng_d = size_d % SIZE_SLICE_1_D;
} }
float temp_av; Complex temp_av;
float temp_bv[6]; Complex temp_bv[6];
float reg_tile[6][6]; Complex reg_tile[6][6];
for (int i = 0; i < 6; i++) for (int i = 0; i < 6; i++){
for (int j = 0; j < 6; j++) for (int j = 0; j < 6; j++){
reg_tile[i][j] = 0.0;
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -592,12 +618,20 @@ int size_internal) ...@@ -592,12 +618,20 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -620,10 +654,10 @@ int size_internal) ...@@ -620,10 +654,10 @@ int size_internal)
} }
} }
// created by tc_gen_code_Kernel() // Tensor Contraction Kernel
__global__ void kernel__1_tex_1(float* dev_t3, __global__ void kernel__1_tex_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2, int* dev_internal_offset_t2, int* dev_internal_offset_v2,
...@@ -631,8 +665,8 @@ int stride_reg_x, int stride_reg_y, ...@@ -631,8 +665,8 @@ int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually // when opt_pre_computed == -1, all indices will be calculated manually
...@@ -656,13 +690,17 @@ int size_internal) ...@@ -656,13 +690,17 @@ int size_internal)
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a; int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
float temp_av; Complex temp_av;
float temp_bv[6]; Complex temp_bv[6];
float reg_tile[6][6]; Complex reg_tile[6][6];
for (int i = 0; i < 6; i++) for (int i = 0; i < 6; i++){
for (int j = 0; j < 6; j++) for (int j = 0; j < 6; j++){
reg_tile[i][j] = 0.0;
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -709,12 +747,19 @@ int size_internal) ...@@ -709,12 +747,19 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -735,9 +780,9 @@ int size_internal) ...@@ -735,9 +780,9 @@ int size_internal)
} }
__global__ void kernel__2_tex_1(float* dev_t3, __global__ void kernel__2_tex_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2, int* dev_internal_offset_t2, int* dev_internal_offset_v2,
...@@ -745,8 +790,8 @@ int stride_reg_x, int stride_reg_y, ...@@ -745,8 +790,8 @@ int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
int internal_upperbound = 0; int internal_upperbound = 0;
...@@ -772,14 +817,18 @@ int size_internal) ...@@ -772,14 +817,18 @@ int size_internal)
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a; int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
Complex temp_av;
Complex temp_bv[6];
Complex reg_tile[6][6];
float temp_av; for (int i = 0; i < 6; i++){
float temp_bv[6]; for (int j = 0; j < 6; j++){
float reg_tile[6][6];
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -830,12 +879,19 @@ int size_internal) ...@@ -830,12 +879,19 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -855,10 +911,10 @@ int size_internal) ...@@ -855,10 +911,10 @@ int size_internal)
} }
} }
// created by tc_gen_code_Kernel() // Tensor Contraction Kernel
__global__ void kernel__3_tex_1(float* dev_t3, __global__ void kernel__3_tex_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2, int* dev_internal_offset_t2, int* dev_internal_offset_v2,
...@@ -866,8 +922,8 @@ int stride_reg_x, int stride_reg_y, ...@@ -866,8 +922,8 @@ int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually // when opt_pre_computed == -1, all indices will be calculated manually
...@@ -925,13 +981,17 @@ int size_internal) ...@@ -925,13 +981,17 @@ int size_internal)
rng_d = size_d % SIZE_SLICE_1_D; rng_d = size_d % SIZE_SLICE_1_D;
} }
float temp_av; Complex temp_av;
float temp_bv[6]; Complex temp_bv[6];
float reg_tile[6][6]; Complex reg_tile[6][6];
for (int i = 0; i < 6; i++) for (int i = 0; i < 6; i++){
for (int j = 0; j < 6; j++) for (int j = 0; j < 6; j++){
reg_tile[i][j] = 0.0;
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -978,12 +1038,20 @@ int size_internal) ...@@ -978,12 +1038,20 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -1006,10 +1074,10 @@ int size_internal) ...@@ -1006,10 +1074,10 @@ int size_internal)
} }
} }
// created by tc_gen_code_Kernel() // Tensor Contraction Kernel
__global__ void kernel__4_tex_1(float* dev_t3, __global__ void kernel__4_tex_1(Complex* dev_t3,
float* dev_t2, Complex* dev_t2,
float* dev_v2, Complex* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2, int* dev_internal_offset_t2, int* dev_internal_offset_v2,
...@@ -1017,8 +1085,8 @@ int stride_reg_x, int stride_reg_y, ...@@ -1017,8 +1085,8 @@ int stride_reg_x, int stride_reg_y,
int size_internal) int size_internal)
{ {
// For Shared Memory, // For Shared Memory,
__shared__ float sm_a[16][96]; __shared__ Complex sm_a[16][96];
__shared__ float sm_b[16][96]; __shared__ Complex sm_b[16][96];
int internal_upperbound = 0; int internal_upperbound = 0;
...@@ -1079,13 +1147,17 @@ int size_internal) ...@@ -1079,13 +1147,17 @@ int size_internal)
rng_d = size_d % SIZE_SLICE_1_D; rng_d = size_d % SIZE_SLICE_1_D;
} }
float temp_av; Complex temp_av;
float temp_bv[6]; Complex temp_bv[6];
float reg_tile[6][6]; Complex reg_tile[6][6];
for (int i = 0; i < 6; i++) for (int i = 0; i < 6; i++){
for (int j = 0; j < 6; j++) for (int j = 0; j < 6; j++){
reg_tile[i][j] = 0.0;
reg_tile[i][j].re = 0.0;
reg_tile[i][j].im = 0.0;
}
}
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+='] // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1 #pragma unroll 1
...@@ -1136,12 +1208,19 @@ int size_internal) ...@@ -1136,12 +1208,19 @@ int size_internal)
{ {
temp_av = sm_a[ll][idx_a + (xx * 16)]; temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0]; // reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1]; // reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2]; // reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3]; // reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4]; // reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5]; // reg_tile[5][xx] += temp_av * temp_bv[5];
reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
} }
} }
__syncthreads(); __syncthreads();
...@@ -1166,27 +1245,27 @@ int size_internal) ...@@ -1166,27 +1245,27 @@ int size_internal)
extern "C" extern "C"
void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* host_t2, float* host_v2, int cond_kernel_1, int opt_register_transpose) void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, Complex* t3, Complex* host_t2, Complex* host_v2, int cond_kernel_1, int opt_register_transpose)
{ {
int num_thread_blocks_kernel_1; int num_thread_blocks_kernel_1;
float* dev_t3; Complex* dev_t3;
float* dev_t2; Complex* dev_t2;
float* dev_v2; Complex* dev_v2;
int* host_internal_left_offset; int* host_internal_left_offset;
int* host_internal_right_offset; int* host_internal_right_offset;
num_thread_blocks_kernel_1 = CEIL(size_a, SIZE_SLICE_1_A) * CEIL(size_b, SIZE_SLICE_1_B) * CEIL(size_c, SIZE_SLICE_1_C) * CEIL(size_d, SIZE_SLICE_1_D); num_thread_blocks_kernel_1 = CEIL(size_a, SIZE_SLICE_1_A) * CEIL(size_b, SIZE_SLICE_1_B) * CEIL(size_c, SIZE_SLICE_1_C) * CEIL(size_d, SIZE_SLICE_1_D);
// hipMalloc() // hipMalloc()
hipMalloc((void**) &dev_t3, sizeof(float) * size_a * size_b * size_c * size_d); hipMalloc((void**) &dev_t3, sizeof(Complex) * size_a * size_b * size_c * size_d);
hipMalloc((void**) &dev_t2, sizeof(float) * size_f * size_b * size_e * size_a); hipMalloc((void**) &dev_t2, sizeof(Complex) * size_f * size_b * size_e * size_a);
hipMalloc((void**) &dev_v2, sizeof(float) * size_e * size_c * size_f * size_d); hipMalloc((void**) &dev_v2, sizeof(Complex) * size_e * size_c * size_f * size_d);
// hipMemcpy() // hipMemcpy()
hipMemcpy(dev_t3, t3, sizeof(float) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice); hipMemcpy(dev_t3, t3, sizeof(Complex) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice);
hipMemcpy(dev_t2, host_t2, sizeof(float) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice); hipMemcpy(dev_t2, host_t2, sizeof(Complex) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice);
hipMemcpy(dev_v2, host_v2, sizeof(float) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice); hipMemcpy(dev_v2, host_v2, sizeof(Complex) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice);
// Related to Kernels // Related to Kernels
// There are 1 Basic Kernels // There are 1 Basic Kernels
...@@ -1302,7 +1381,7 @@ void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, ...@@ -1302,7 +1381,7 @@ void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e,
} }
// Copy the Result from Device to Host // Copy the Result from Device to Host
hipMemcpy(t3, dev_t3, sizeof(float) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost); hipMemcpy(t3, dev_t3, sizeof(Complex) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost);
// hipFree() // hipFree()
hipFree(dev_t3); hipFree(dev_t2); hipFree(dev_v2); hipFree(dev_t3); hipFree(dev_t2); hipFree(dev_v2);
...@@ -1315,7 +1394,7 @@ void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, ...@@ -1315,7 +1394,7 @@ void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e,
// This is written by tc_interface.tc_gen_code_interface() // This is written by tc_interface.tc_gen_code_interface()
// This Interface Should be Called to Run the Kernels // This Interface Should be Called to Run the Kernels
extern "C" extern "C"
void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* t2, float* v2, int cond_kernel_1, int opt_register_transpose) void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, Complex* t3, Complex* t2, Complex* v2, int cond_kernel_1, int opt_register_transpose)
{ {
// Pre-Processing for Split // Pre-Processing for Split
// Based on Tile-Sizes and Problem-Size // Based on Tile-Sizes and Problem-Size
......
// //
// Sample Code: // Sample Code:
// //
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "4D_kernel.hpp" #include "4D_kernel.hpp"
//#define DEBUG_CORRECTNESS // #define DEBUG_CORRECTNESS
//#define DEBUG_SIMPLE_CORRECTNESS //#define DEBUG_SIMPLE_CORRECTNESS
void pre_Initializing_Input_Tensors(); void pre_Initializing_Input_Tensors();
void post_Correctness(); void post_Correctness();
// Initialize t3 (t3_temp), 9 t2 and 9 v2. // Initialize t3 (t3_temp), 9 t2 and 9 v2.
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C, float* h_A, int size_A, float* h_B, int size_B) void pre_Initializing_Input_Tensors(Complex* h_C, Complex* h_C_chk, int size_C, Complex* h_A, int size_A, Complex* h_B, int size_B)
{ {
// t3 // t3
int i, j; int i, j;
for (i = 0; i < size_C; i++) for (i = 0; i < size_C; i++)
{ {
h_C[i] = 0.0; h_C[i].re = 0.0;
h_C_chk[i] = 0.0; h_C_chk[i].re = 0.0;
h_C[i].im = 0.0;
h_C_chk[i].im = 0.0;
} }
for (j = 0; j < size_A; j++) for (j = 0; j < size_A; j++)
{ {
h_A[j] = ((float)rand() / RAND_MAX); h_A[j].re = ((float)rand() / RAND_MAX);
h_A[j].im = ((float)rand() / RAND_MAX);
} }
for (j = 0; j < size_B; j++) for (j = 0; j < size_B; j++)
{ {
h_B[j] = ((float)rand() / RAND_MAX); h_B[j].re = ((float)rand() / RAND_MAX);
h_B[j].im = ((float)rand() / RAND_MAX);
} }
} }
// //
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int size_idx_a, int size_idx_b, int size_idx_c, int size_idx_d, int size_idx_e, int size_idx_f) void post_Correctness(Complex* h_C, Complex* h_C_chk, Complex* h_A, Complex* h_B, int size_idx_a, int size_idx_b, int size_idx_c, int size_idx_d, int size_idx_e, int size_idx_f)
{ {
// t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e]; // t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
int size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d; int size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;
...@@ -59,8 +66,18 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si ...@@ -59,8 +66,18 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si
{ {
for (idx_f = 0; idx_f < size_idx_f; idx_f++) for (idx_f = 0; idx_f < size_idx_f; idx_f++)
{ {
h_C_chk[tmp_r_idx] += h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a] * h_C_chk[tmp_r_idx].re += (h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].re *
h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d]; h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].re)
-(h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].im *
h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].im);
h_C_chk[tmp_r_idx].im += (h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].re *
h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].im)
+(h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].im *
h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].re);
ops++; ops++;
} }
tmp_ops = tmp_ops + ops; tmp_ops = tmp_ops + ops;
...@@ -68,28 +85,51 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si ...@@ -68,28 +85,51 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si
} }
printf ("======================================= Correctness Check ==========================================\n"); printf ("======================================= Correctness Check ==========================================\n");
float epsilon = 0.00000001;
int diff = 0; float epsilon = 0.01;
int same = 0;
int diff_re = 0;
int diff_im = 0;
int same_re = 0;
int same_im = 0;
int i; int i;
for (i = 0; i < size_C; i++) for (i = 0; i < size_C; i++)
{ {
float check = h_C_chk[i] - h_C[i]; float check_re = h_C_chk[i].re - h_C[i].re;
if (check < 0) check *= -1; float check_im = h_C_chk[i].im - h_C[i].im;
if (check > epsilon)
if (check_re < 0) check_re *= -1;
if (check_re > epsilon)
{
diff_re++;
if (diff_re < 8)
printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i].re, h_C[i].re, check_re);
}
else
{
same_re++;
}
if (check_im < 0) check_im *= -1;
if (check_im > epsilon)
{ {
diff++; diff_im++;
if (diff < 8) if (diff_im < 8)
printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i], h_C[i], check); printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i].im, h_C[i].im, check_im);
} }
else else
{ {
same++; same_im++;
} }
} }
printf (" >>> PASSED: %'10d among %'10d in t3\n", same, size_C); printf (" >>> PASSED on Re: %'10d among %'10d in t3\n", same_re, size_C);
printf (" >>> ERROR : %'10d among %'10d in t3\n", diff, size_C); printf (" >>> PASSED on Im: %'10d among %'10d in t3\n", same_im, size_C);
printf (" >>> ERROR on Re : %'10d among %'10d in t3\n", diff_re, size_C);
printf (" >>> ERROR on Im : %'10d among %'10d in t3\n", diff_im, size_C);
printf (" >>> Total Operations: %'lld\n", tmp_ops * 2); printf (" >>> Total Operations: %'lld\n", tmp_ops * 2);
printf ("====================================================================================================\n"); printf ("====================================================================================================\n");
} }
...@@ -101,9 +141,9 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si ...@@ -101,9 +141,9 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
// for sd2 // for sd2
float *host_C, *host_C_chk; Complex *host_C, *host_C_chk;
float *host_A; Complex *host_A;
float *host_B; Complex *host_B;
int size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f; int size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f;
// Problem Size // Problem Size
...@@ -137,10 +177,10 @@ int main(int argc, char** argv) ...@@ -137,10 +177,10 @@ int main(int argc, char** argv)
size_B = size_idx_d * size_idx_f * size_idx_c * size_idx_e; size_B = size_idx_d * size_idx_f * size_idx_c * size_idx_e;
// //
host_C = (float*)malloc(sizeof(float) * size_C); host_C = (Complex*)malloc(sizeof(Complex) * size_C);
host_C_chk = (float*)malloc(sizeof(float) * size_C); host_C_chk = (Complex*)malloc(sizeof(Complex) * size_C);
host_A = (float*)malloc(sizeof(float) * size_A); host_A = (Complex*)malloc(sizeof(Complex) * size_A);
host_B = (float*)malloc(sizeof(float) * size_B); host_B = (Complex*)malloc(sizeof(Complex) * size_B);
printf ("==========================================================================================================\n"); printf ("==========================================================================================================\n");
printf (">>> abcd-aebf-dfce\n"); printf (">>> abcd-aebf-dfce\n");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment