Tensor contraction now works with the Complex data type

d2cd5658 · Muhammed Ozturk · 160cf6ed · d2cd5658 · d2cd5658
Commit d2cd5658 authored Oct 27, 2023 by Muhammed Ozturk
Showing 2 changed files with 286 additions and 167 deletions

example/complex_contraction/4D_kernel.hpp +218 -139

example/complex_contraction/main.cpp +68 -28

No files found.
--- a/example/complex_contraction/4D_kernel.hpp
+++ b/example/complex_contraction/4D_kernel.hpp
@@ -67,17 +67,17 @@ __device__ Complex ComplexAdd(Complex a, Complex b)
 }


-__global__ void kernel__1_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+__global__ void kernel__1_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int stride_reg_x, int stride_reg_y, 
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	// when opt_pre_computed == -1, all indices will be calculated manually
@@ -101,13 +101,17 @@ int size_internal)
 	int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;


-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -154,12 +158,19 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
 			}
 		}
 		__syncthreads();
@@ -179,18 +190,18 @@ int size_internal)
 	}
 }

-// created by tc_gen_code_Kernel()
-__global__ void kernel__2_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+// Tensor Contraction Kernel
+__global__ void kernel__2_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int stride_reg_x, int stride_reg_y, 
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	int internal_upperbound   = 0;
@@ -217,13 +228,17 @@ int size_internal)
 	int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;


-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -274,12 +289,19 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
 			}
 		}
 		__syncthreads();
@@ -299,7 +321,7 @@ int size_internal)
 	}
 }

-// created by tc_gen_code_Kernel()
+// Tensor Contraction Kernel
 __global__ void kernel__3_1(Complex* dev_t3, 
 Complex* dev_t2, 
 Complex* dev_v2, 
@@ -376,7 +398,7 @@ int size_internal)
 		for (int j = 0; j < 6; j++){

 			reg_tile[i][j].re = 0.0;
-			reg_tile[i][j].im
+			reg_tile[i][j].im = 0.0;
 		}
 	}
 		
@@ -463,18 +485,18 @@ int size_internal)
 	}
 }

-// created by tc_gen_code_Kernel()
-__global__ void kernel__4_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+// Tensor Contraction Kernel
+__global__ void kernel__4_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int stride_reg_x, int stride_reg_y, 
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	int internal_upperbound   = 0;
@@ -535,13 +557,17 @@ int size_internal)
 		rng_d = size_d % SIZE_SLICE_1_D;
 	}

-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -592,12 +618,20 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
+
 			}
 		}
 		__syncthreads();
@@ -620,10 +654,10 @@ int size_internal)
 	}
 }

-// created by tc_gen_code_Kernel()
-__global__ void kernel__1_tex_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+// Tensor Contraction Kernel
+__global__ void kernel__1_tex_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int* dev_internal_offset_t2, int* dev_internal_offset_v2, 
@@ -631,8 +665,8 @@ int stride_reg_x, int stride_reg_y,
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	// when opt_pre_computed == -1, all indices will be calculated manually
@@ -656,13 +690,17 @@ int size_internal)
 	int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;


-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -709,12 +747,19 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
 			}
 		}
 		__syncthreads();
@@ -735,9 +780,9 @@ int size_internal)
 }


-__global__ void kernel__2_tex_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+__global__ void kernel__2_tex_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int* dev_internal_offset_t2, int* dev_internal_offset_v2, 
@@ -745,8 +790,8 @@ int stride_reg_x, int stride_reg_y,
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	int internal_upperbound   = 0;
@@ -772,14 +817,18 @@ int size_internal)

 	int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -830,12 +879,19 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
 			}
 		}
 		__syncthreads();
@@ -855,10 +911,10 @@ int size_internal)
 	}
 }

-// created by tc_gen_code_Kernel()
-__global__ void kernel__3_tex_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+// Tensor Contraction Kernel
+__global__ void kernel__3_tex_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int* dev_internal_offset_t2, int* dev_internal_offset_v2, 
@@ -866,8 +922,8 @@ int stride_reg_x, int stride_reg_y,
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	// when opt_pre_computed == -1, all indices will be calculated manually
@@ -925,13 +981,17 @@ int size_internal)
 		rng_d = size_d % SIZE_SLICE_1_D;
 	}

-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -978,12 +1038,20 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
+
 			}
 		}
 		__syncthreads();
@@ -1006,10 +1074,10 @@ int size_internal)
 	}
 }

-// created by tc_gen_code_Kernel()
-__global__ void kernel__4_tex_1(float* dev_t3, 
-float* dev_t2, 
-float* dev_v2, 
+// Tensor Contraction Kernel
+__global__ void kernel__4_tex_1(Complex* dev_t3, 
+Complex* dev_t2, 
+Complex* dev_v2, 
 int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, 
 int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d, 
 int* dev_internal_offset_t2, int* dev_internal_offset_v2, 
@@ -1017,8 +1085,8 @@ int stride_reg_x, int stride_reg_y,
 int size_internal)
 {
 	// For Shared Memory,
-	__shared__ float sm_a[16][96];
-	__shared__ float sm_b[16][96];
+	__shared__ Complex sm_a[16][96];
+	__shared__ Complex sm_b[16][96];


 	int internal_upperbound   = 0;
@@ -1079,13 +1147,17 @@ int size_internal)
 		rng_d = size_d % SIZE_SLICE_1_D;
 	}

-	float temp_av;
-	float temp_bv[6];
-	float reg_tile[6][6];
+	Complex temp_av;
+	Complex temp_bv[6];
+	Complex reg_tile[6][6];

-	for (int i = 0; i < 6; i++)
-	for (int j = 0; j < 6; j++)
-	reg_tile[i][j] = 0.0;
+	for (int i = 0; i < 6; i++){
+		for (int j = 0; j < 6; j++){
+
+			reg_tile[i][j].re = 0.0;
+			reg_tile[i][j].im = 0.0;
+		}
+	}

 	// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
 	#pragma unroll 1
@@ -1136,12 +1208,19 @@ int size_internal)
 			{
 				temp_av = sm_a[ll][idx_a + (xx * 16)];

-				reg_tile[0][xx] += temp_av * temp_bv[0];
-				reg_tile[1][xx] += temp_av * temp_bv[1];
-				reg_tile[2][xx] += temp_av * temp_bv[2];
-				reg_tile[3][xx] += temp_av * temp_bv[3];
-				reg_tile[4][xx] += temp_av * temp_bv[4];
-				reg_tile[5][xx] += temp_av * temp_bv[5];
+				// reg_tile[0][xx] += temp_av * temp_bv[0];
+				// reg_tile[1][xx] += temp_av * temp_bv[1];
+				// reg_tile[2][xx] += temp_av * temp_bv[2];
+				// reg_tile[3][xx] += temp_av * temp_bv[3];
+				// reg_tile[4][xx] += temp_av * temp_bv[4];
+				// reg_tile[5][xx] += temp_av * temp_bv[5];
+
+				reg_tile[0][xx] = ComplexAdd(reg_tile[0][xx] , ComplexMul(temp_av, temp_bv[0] )) ;
+				reg_tile[1][xx] = ComplexAdd(reg_tile[1][xx] , ComplexMul(temp_av, temp_bv[1] )) ;
+				reg_tile[2][xx] = ComplexAdd(reg_tile[2][xx] , ComplexMul(temp_av, temp_bv[2] )) ;
+				reg_tile[3][xx] = ComplexAdd(reg_tile[3][xx] , ComplexMul(temp_av, temp_bv[3] )) ;
+				reg_tile[4][xx] = ComplexAdd(reg_tile[4][xx] , ComplexMul(temp_av, temp_bv[4] )) ;
+				reg_tile[5][xx] = ComplexAdd(reg_tile[5][xx] , ComplexMul(temp_av, temp_bv[5] )) ;
 			}
 		}
 		__syncthreads();
@@ -1166,27 +1245,27 @@ int size_internal)


 extern "C"
-void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* host_t2, float* host_v2, int cond_kernel_1, int opt_register_transpose)
+void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, Complex* t3, Complex* host_t2, Complex* host_v2, int cond_kernel_1, int opt_register_transpose)
 {
 	int num_thread_blocks_kernel_1;

-	float* dev_t3;
-	float* dev_t2;
-	float* dev_v2;
+	Complex* dev_t3;
+	Complex* dev_t2;
+	Complex* dev_v2;

 	int* host_internal_left_offset;
 	int* host_internal_right_offset;

 	num_thread_blocks_kernel_1 = CEIL(size_a, SIZE_SLICE_1_A) * CEIL(size_b, SIZE_SLICE_1_B) * CEIL(size_c, SIZE_SLICE_1_C) * CEIL(size_d, SIZE_SLICE_1_D);
 	// hipMalloc()
-	hipMalloc((void**) &dev_t3, sizeof(float) * size_a * size_b * size_c * size_d);
-	hipMalloc((void**) &dev_t2, sizeof(float) * size_f * size_b * size_e * size_a);
-	hipMalloc((void**) &dev_v2, sizeof(float) * size_e * size_c * size_f * size_d);
+	hipMalloc((void**) &dev_t3, sizeof(Complex) * size_a * size_b * size_c * size_d);
+	hipMalloc((void**) &dev_t2, sizeof(Complex) * size_f * size_b * size_e * size_a);
+	hipMalloc((void**) &dev_v2, sizeof(Complex) * size_e * size_c * size_f * size_d);

 	// hipMemcpy()
-	hipMemcpy(dev_t3, t3, sizeof(float) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice);
-	hipMemcpy(dev_t2, host_t2, sizeof(float) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice);
-	hipMemcpy(dev_v2, host_v2, sizeof(float) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice);
+	hipMemcpy(dev_t3, t3, sizeof(Complex) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice);
+	hipMemcpy(dev_t2, host_t2, sizeof(Complex) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice);
+	hipMemcpy(dev_v2, host_v2, sizeof(Complex) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice);

 	// Related to Kernels
 	// There are 1 Basic Kernels
@@ -1302,7 +1381,7 @@ void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e,
 	}

 	// Copy the Result from Device to Host
-	hipMemcpy(t3, dev_t3, sizeof(float) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost);
+	hipMemcpy(t3, dev_t3, sizeof(Complex) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost);

 	// hipFree()
 	hipFree(dev_t3);	hipFree(dev_t2);	hipFree(dev_v2);
@@ -1315,7 +1394,7 @@ void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e,
 // This is written by tc_interface.tc_gen_code_interface()
 // This Interface Should be Called to Run the Kernels
 extern "C"
-void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* t2, float* v2, int cond_kernel_1, int opt_register_transpose)
+void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, Complex* t3, Complex* t2, Complex* v2, int cond_kernel_1, int opt_register_transpose)
 {
 	// Pre-Processing for Split
 	// Based on Tile-Sizes and Problem-Size

--- a/example/complex_contraction/main.cpp
+++ b/example/complex_contraction/main.cpp
 //
 //	Sample Code:
 //
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "4D_kernel.hpp"

-//#define DEBUG_CORRECTNESS
+// #define DEBUG_CORRECTNESS
 //#define DEBUG_SIMPLE_CORRECTNESS

 void pre_Initializing_Input_Tensors();
 void post_Correctness();

 // Initialize t3 (t3_temp), 9 t2 and 9 v2.
-void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C, float* h_A, int size_A, float* h_B, int size_B)
+void pre_Initializing_Input_Tensors(Complex* h_C, Complex* h_C_chk, int size_C, Complex* h_A, int size_A, Complex* h_B, int size_B)
 {
 	// t3
 	int i, j;
 	for (i = 0; i < size_C; i++)
 	{
-		h_C[i] 	= 0.0;
-		h_C_chk[i] = 0.0;
+		h_C[i].re 	= 0.0;
+		h_C_chk[i].re = 0.0;
+
+		h_C[i].im 	= 0.0;
+		h_C_chk[i].im = 0.0;
 	}

 	for (j = 0; j < size_A; j++)
 	{
-		h_A[j] = ((float)rand() / RAND_MAX);
+		h_A[j].re = ((float)rand() / RAND_MAX);
+		h_A[j].im = ((float)rand() / RAND_MAX);
+
 	}

 	for (j = 0; j < size_B; j++)
 	{
-		h_B[j] = ((float)rand() / RAND_MAX);
+		h_B[j].re = ((float)rand() / RAND_MAX);
+		h_B[j].im = ((float)rand() / RAND_MAX);
 	}
 }

 //
-void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int size_idx_a, int size_idx_b, int size_idx_c, int size_idx_d, int size_idx_e, int size_idx_f)
+void post_Correctness(Complex* h_C, Complex* h_C_chk, Complex* h_A, Complex* h_B, int size_idx_a, int size_idx_b, int size_idx_c, int size_idx_d, int size_idx_e, int size_idx_f)
 {
    //  t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
    int size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;
@@ -59,8 +66,18 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si
        {
            for (idx_f = 0; idx_f < size_idx_f; idx_f++)
            {   
-                h_C_chk[tmp_r_idx] += 	h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a] * 
-                                        h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d];
+                h_C_chk[tmp_r_idx].re += 	(h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].re * 
+                                        	h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].re)
+											-(h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].im * 
+											h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].im);
+
+
+				h_C_chk[tmp_r_idx].im += 	(h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].re * 
+											h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].im)
+
+											+(h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a].im *
+											h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d].re);
+
                ops++;
            }
            tmp_ops = tmp_ops + ops;
@@ -68,28 +85,51 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si
 	}

 	printf ("======================================= Correctness Check ==========================================\n");
-	float   epsilon = 0.00000001;
-	int      diff    = 0;
-	int      same    = 0;
+	
+	float   epsilon = 0.01;
+	
+	int      diff_re    = 0;
+	int      diff_im    = 0;
+
+	int      same_re    = 0;
+	int      same_im    = 0;
 	int 	 i;
+
 	for (i = 0; i < size_C; i++)
 	{
-		float check = h_C_chk[i] - h_C[i];
-		if (check < 0) check *= -1;
-		if (check > epsilon)
+		float check_re = h_C_chk[i].re - h_C[i].re;
+		float check_im = h_C_chk[i].im - h_C[i].im;
+
+
+		if (check_re < 0) check_re *= -1;
+		if (check_re > epsilon)
+		{
+			diff_re++;
+			if (diff_re < 8)
+			printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i].re, h_C[i].re, check_re);
+		}
+		else
+		{
+			same_re++;
+		}
+
+		if (check_im < 0) check_im *= -1;
+		if (check_im > epsilon)
 		{
-			diff++;
-			if (diff < 8)
-			printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i], h_C[i], check);
+			diff_im++;
+			if (diff_im < 8)
+			printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i].im, h_C[i].im, check_im);
 		}
 		else
 		{
-			same++;
+			same_im++;
 		}
 	}

-	printf (" >>> PASSED: %'10d among %'10d in t3\n", same, size_C);
-	printf (" >>> ERROR : %'10d among %'10d in t3\n", diff, size_C);
+	printf (" >>> PASSED on Re: %'10d among %'10d in t3\n", same_re, size_C);
+	printf (" >>> PASSED on Im: %'10d among %'10d in t3\n", same_im, size_C);
+	printf (" >>> ERROR on Re : %'10d among %'10d in t3\n", diff_re, size_C);
+	printf (" >>> ERROR on Im : %'10d among %'10d in t3\n", diff_im, size_C);
 	printf (" >>> Total Operations: %'lld\n", tmp_ops * 2);
 	printf ("====================================================================================================\n");
 }
@@ -101,9 +141,9 @@ void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int si
 int main(int argc, char** argv)
 {
 	// for sd2
-	float *host_C, *host_C_chk;
-	float *host_A;
-	float *host_B;
+	Complex *host_C, *host_C_chk;
+	Complex *host_A;
+	Complex *host_B;
 	int size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f;

 	// Problem Size
@@ -137,10 +177,10 @@ int main(int argc, char** argv)
 	size_B = size_idx_d * size_idx_f * size_idx_c * size_idx_e;

    //
-	host_C 		= (float*)malloc(sizeof(float) * size_C);
-	host_C_chk 	= (float*)malloc(sizeof(float) * size_C);
-	host_A 		= (float*)malloc(sizeof(float) * size_A);
-	host_B 		= (float*)malloc(sizeof(float) * size_B);
+	host_C 		= (Complex*)malloc(sizeof(Complex) * size_C);
+	host_C_chk 	= (Complex*)malloc(sizeof(Complex) * size_C);
+	host_A 		= (Complex*)malloc(sizeof(Complex) * size_A);
+	host_B 		= (Complex*)malloc(sizeof(Complex) * size_B);
 	
 	printf ("==========================================================================================================\n");
    printf (">>> abcd-aebf-dfce\n");