gaoqiong / composable_kernel / Commits / f0a8ee84
"vscode:/vscode.git/clone" did not exist on "3552bca1015f1257b8efbd0567a60a5862ea11e7"
Commit f0a8ee84, authored Oct 10, 2023 by Muhammed Ozturk

complex type contraction

Parent: 59136091

Showing 3 changed files with 1451 additions and 0 deletions (+1451, -0):
example/complex_contraction/4D_kernel.hpp   +1277  -0
example/complex_contraction/Makefile        +8     -0
example/complex_contraction/main.cpp        +166   -0
example/complex_contraction/4D_kernel.hpp  (new file, 0 → 100644)
// created by tc_code_include() in tc_code_include.py
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <locale.h>
#include <algorithm>
using namespace std;

// created by tc_gen_definition_new()
#define SIZE_SLICE_1_E 16
#define SIZE_SLICE_1_F 1
#define SIZE_SLICE_1_A 16
#define SIZE_SLICE_1_B 6
#define SIZE_SLICE_1_D 16
#define SIZE_SLICE_1_C 6

#define SIZE_INT_UNIT_1 SIZE_SLICE_1_E * SIZE_SLICE_1_F

#define SIZE_TB_1_X SIZE_SLICE_1_A
#define SIZE_TB_1_Y SIZE_SLICE_1_D
#define SIZE_REG_1_X SIZE_SLICE_1_B
#define SIZE_REG_1_Y SIZE_SLICE_1_C

#define NUM_INDEX 4
#define CEIL(a, b) (((a) + (b) - 1) / (b))

// Not Yet: Multiple Tensor Contractions.
// |Constant Memory| = 64KB, 16K Words(Integer), which means |K| <= 8192
#define MAX_CONST_LEN 8192
__constant__ int const_internal_t2_offset[MAX_CONST_LEN];
__constant__ int const_internal_v2_offset[MAX_CONST_LEN];
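
Editor's note: the 8192 cap follows directly from the budget quoted in the comment above: the two tables together hold 2 x 8192 four-byte integers, exactly 64 KB. A minimal compile-time guard (an illustration added here, not part of the commit) would make that invariant explicit:

    // Hypothetical sanity check (not in the commit): both __constant__ tables
    // together must fit the 64 KB constant-memory budget cited above.
    static_assert(2 * MAX_CONST_LEN * sizeof(int) <= 64 * 1024,
                  "internal offset tables exceed 64 KB of constant memory");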
// created by tc_gen_code_Kernel()
__global__ void kernel__1_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['a', 'e', 'b', 'f']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_a
            sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                     + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                     + const_internal_t2_offset[threadIdx.y + l]];
        }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['d', 'f', 'c', 'e']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_d
            sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                     + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                     + const_internal_v2_offset[threadIdx.y + l]];
        }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
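
Editor's note: every kernel in this file recovers (blk_idx_a, blk_idx_b, blk_idx_c, blk_idx_d) from the flat 1-D blockIdx.x with the same divide/modulo cascade. The host-side sketch below (hypothetical and added for illustration; check_delinearize is not part of the commit) verifies that the cascade inverts the row-major linearization flat = ((d * numBlk_c + c) * numBlk_b + b) * numBlk_a + a:

    // Hypothetical host-side round-trip check of the kernels' delinearization.
    #include <assert.h>
    void check_delinearize(int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d) {
        for (int flat = 0; flat < numBlk_a * numBlk_b * numBlk_c * numBlk_d; flat++) {
            int blk_idx_d = flat / (numBlk_c * numBlk_b * numBlk_a);
            int rem       = flat % (numBlk_c * numBlk_b * numBlk_a);
            int blk_idx_c = rem / (numBlk_b * numBlk_a);
            rem           = rem % (numBlk_b * numBlk_a);
            int blk_idx_b = rem / numBlk_a;
            int blk_idx_a = rem % numBlk_a;
            // Re-linearize and confirm we land back on the same flat index.
            assert(((blk_idx_d * numBlk_c + blk_idx_c) * numBlk_b + blk_idx_b) * numBlk_a
                   + blk_idx_a == flat);
        }
    }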
// created by tc_gen_code_Kernel()
__global__ void kernel__2_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + const_internal_t2_offset[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + const_internal_v2_offset[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
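
Editor's note: kernel__2_1 is kernel__1_1 plus the internal_upperbound logic, which trims the final trip of the l-loop when size_internal is not a multiple of SIZE_INT_UNIT_1. A worked example with assumed values size_internal = 20 and SIZE_INT_UNIT_1 = 16: on the second trip (l = 16), internal_offset = (16 + 16) - 20 = 12 > 0, so internal_upperbound becomes 12 and only 16 - 12 = 4 shared-memory rows are loaded and reduced, which keeps const_internal_*_offset[threadIdx.y + l] within the valid range.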
// created by tc_gen_code_Kernel()
__global__ void kernel__3_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + const_internal_t2_offset[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + const_internal_v2_offset[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
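
Editor's note: kernel__3_1 is kernel__1_1 plus the rng_* bounds for partial tiles on the external indices. A worked example with an assumed size_a = 20 and SIZE_SLICE_1_A = 16: CEIL(20, 16) = 2 blocks along a; for the first, 20 - 0 * 16 >= 16, so rng_a = 16, while for the second, 20 - 16 = 4 < 16, so rng_a = 20 % 16 = 4, and the idx_a < rng_a guards idle the twelve out-of-range lanes of that last block.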
// created by tc_gen_code_Kernel()
__global__ void kernel__4_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + const_internal_t2_offset[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + const_internal_v2_offset[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
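
Editor's note: the four kernel__*_tex_1 variants that follow are line-for-line copies of kernel__1_1 through kernel__4_1 with a single difference: the per-iteration input offsets are read from the global-memory arrays dev_internal_offset_t2 / dev_internal_offset_v2, passed as two extra pointer parameters, instead of from the __constant__ tables. The decision tree in sd_t_d2_fusion() below selects a _tex_ variant whenever size_internal > MAX_CONST_LEN, i.e., when the offset tables no longer fit in constant memory.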
// created by tc_gen_code_Kernel()
__global__ void kernel__1_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['a', 'e', 'b', 'f']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_a
            sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                     + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                     + dev_internal_offset_t2[threadIdx.y + l]];
        }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['d', 'f', 'c', 'e']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_d
            sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                     + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                     + dev_internal_offset_v2[threadIdx.y + l]];
        }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
// created by tc_gen_code_Kernel()
__global__ void kernel__2_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + dev_internal_offset_t2[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + dev_internal_offset_v2[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
// created by tc_gen_code_Kernel()
__global__ void kernel__3_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + dev_internal_offset_t2[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + dev_internal_offset_v2[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
// created by tc_gen_code_Kernel()
__global__ void kernel__4_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + dev_internal_offset_t2[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + dev_internal_offset_v2[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
// written by tc_interface.tc_gen_code_interface_Header()
extern "C"
void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                    float* t3, float* host_t2, float* host_v2,
                    int cond_kernel_1, int opt_register_transpose)
{
    int num_thread_blocks_kernel_1;
    float* dev_t3;
    float* dev_t2;
    float* dev_v2;
    int* host_internal_left_offset;
    int* host_internal_right_offset;

    num_thread_blocks_kernel_1 = CEIL(size_a, SIZE_SLICE_1_A) * CEIL(size_b, SIZE_SLICE_1_B)
                               * CEIL(size_c, SIZE_SLICE_1_C) * CEIL(size_d, SIZE_SLICE_1_D);

    // hipMalloc()
    hipMalloc((void**)&dev_t3, sizeof(float) * size_a * size_b * size_c * size_d);
    hipMalloc((void**)&dev_t2, sizeof(float) * size_f * size_b * size_e * size_a);
    hipMalloc((void**)&dev_v2, sizeof(float) * size_e * size_c * size_f * size_d);

    // hipMemcpy()
    hipMemcpy(dev_t3, t3, sizeof(float) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice);
    hipMemcpy(dev_t2, host_t2, sizeof(float) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice);
    hipMemcpy(dev_v2, host_v2, sizeof(float) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice);

    // Related to Kernels
    // There is 1 Basic Kernel
    long long int tmp_operations = (long long int)((long long int)(size_a * size_b * size_c * size_d) * size_e) * size_f;

    printf("========================================= fusedKernels =============================================\n");
    printf(" Grid Size : %6d (1D)\n", num_thread_blocks_kernel_1);
    printf(" Block-size : %2d, %2d (2D)\n", SIZE_TB_1_X, SIZE_TB_1_Y);
    printf(" Reg.-size : %2d, %2d (2D)\n", SIZE_REG_1_X, SIZE_REG_1_Y);
    printf(" A thread deals with (%d x %d) elements (basically)\n", SIZE_TB_1_X * SIZE_REG_1_X, SIZE_TB_1_Y * SIZE_REG_1_Y);
    printf(" # of Operations: %lld\n", tmp_operations);
    printf("====================================================================================================\n");

    dim3 gridsize_1(num_thread_blocks_kernel_1);
    dim3 blocksize_1(SIZE_TB_1_X, SIZE_TB_1_Y);

    int stride_output_a = 1;
    int stride_output_b = stride_output_a * size_a;
    int stride_output_c = stride_output_b * size_b;
    int stride_output_d = stride_output_c * size_c;

    int stride_reg_x_1 = stride_output_b;
    int stride_reg_y_1 = stride_output_c;

    int size_internal = size_e * size_f;

    // (manually) ['e', 'f']
    host_internal_left_offset  = (int*)malloc(sizeof(int) * size_internal);
    host_internal_right_offset = (int*)malloc(sizeof(int) * size_internal);

    for (int idx_f = 0; idx_f < size_f; idx_f++)
        for (int idx_e = 0; idx_e < size_e; idx_e++)
        {
            host_internal_left_offset[idx_e + (idx_f) * size_e]  = (idx_e + ((idx_f) * size_b) * size_e) * size_a;
            host_internal_right_offset[idx_e + (idx_f) * size_e] = (idx_f + ((idx_e) * size_c) * size_f) * size_d;
        }

    hipMemcpyToSymbol(const_internal_t2_offset, host_internal_left_offset, sizeof(int) * size_internal);
    hipMemcpyToSymbol(const_internal_v2_offset, host_internal_right_offset, sizeof(int) * size_internal);

    int* dev_internal_offset_t2;
    int* dev_internal_offset_v2;

    // hipMalloc()
    hipMalloc((void**)&dev_internal_offset_t2, sizeof(int) * size_internal);
    hipMalloc((void**)&dev_internal_offset_v2, sizeof(int) * size_internal);

    // hipMemcpy()
    hipMemcpy(dev_internal_offset_t2, host_internal_left_offset, sizeof(int) * size_internal, hipMemcpyHostToDevice);
    hipMemcpy(dev_internal_offset_v2, host_internal_right_offset, sizeof(int) * size_internal, hipMemcpyHostToDevice);

    // Decision Tree for Kernel Types
    // No Chance to Utilize the Register Transpose
    if (size_a % SIZE_SLICE_1_A == 0 && size_b % SIZE_SLICE_1_B == 0 &&
        size_c % SIZE_SLICE_1_C == 0 && size_d % SIZE_SLICE_1_D == 0)
    {
        // [2] External Index: Full
        if (size_e % SIZE_SLICE_1_E == 0 && size_f % SIZE_SLICE_1_F == 0)
        {
            // [3] Internal Index: Full
            // >>> External: Full && Internal: Full
            printf("External: Full, Internal: Full\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__1_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__1_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
        else
        {
            // [4] Internal Index: Partial
            // >>> External: Full && Internal: Partial
            printf("External: Full, Internal: Partial\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__2_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__2_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
    }
    else
    {
        // [2] External Index: Partial
        if (size_e % SIZE_SLICE_1_E == 0 && size_f % SIZE_SLICE_1_F == 0)
        {
            // [3] Internal Index: Full
            // >>> External: Partial && Internal: Full
            printf("External: Partial, Internal: Full\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__3_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__3_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
        else
        {
            // [4] Internal Index: Partial
            // >>> External: Partial && Internal: Partial
            printf("External: Partial, Internal: Partial\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__4_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__4_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
    }

    // Copy the Result from Device to Host
    hipMemcpy(t3, dev_t3, sizeof(float) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost);

    // hipFree()
    hipFree(dev_t3);
    hipFree(dev_t2);
    hipFree(dev_v2);

    // Should be Fixed
    // HostFree
}
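
Editor's note: the closing "Should be Fixed / HostFree" comment flags a real leak: host_internal_left_offset, host_internal_right_offset, dev_internal_offset_t2, and dev_internal_offset_v2 are allocated but never released. A minimal sketch of the missing cleanup (not in the commit), to be placed just before sd_t_d2_fusion() returns:

    // Sketch of the cleanup requested by the "Should be Fixed // HostFree" comment.
    free(host_internal_left_offset);
    free(host_internal_right_offset);
    hipFree(dev_internal_offset_t2);
    hipFree(dev_internal_offset_v2);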
// This is written by tc_interface.tc_gen_code_interface()
// This Interface Should be Called to Run the Kernels
extern "C"
void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                     float* t3, float* t2, float* v2,
                     int cond_kernel_1, int opt_register_transpose)
{
    // Pre-Processing for Split
    // Based on Tile-Sizes and Problem-Size
    // Currently, one index can be split into two indices
    // Call An Application
    sd_t_d2_fusion(size_a, size_b, size_c, size_d, size_e, size_f,
                   t3, t2, v2, cond_kernel_1, opt_register_transpose);
}
example/complex_contraction/Makefile  (new file, 0 → 100644)
4D_kernel:
	hipcc -O3 --offload-arch=gfx90a main.cpp 4D_kernel.hpp -o $@

clean:
	rm 4D_kernel
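
Editor's note: main.cpp defaults all six extents to 16 and reads them from the command line when exactly six are given (argc == 7), so a typical session looks like the following; the extents in the second invocation are arbitrary example values, not values from the commit:

    make
    ./4D_kernel                    # all extents default to 16
    ./4D_kernel 32 6 6 32 16 1     # a b c d e f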
example/complex_contraction/main.cpp  (new file, 0 → 100644)
//
// Sample Code:
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//#define DEBUG_CORRECTNESS
//#define DEBUG_SIMPLE_CORRECTNESS

// Full prototypes (the commit declares these with empty parameter lists, which
// does not compile as C++ once they are called with arguments below).
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C,
                                    float* h_A, int size_A, float* h_B, int size_B);
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B,
                      int size_idx_a, int size_idx_b, int size_idx_c,
                      int size_idx_d, int size_idx_e, int size_idx_f);
// Defined in 4D_kernel.hpp; declared here so this translation unit compiles.
extern "C" void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                float* t3, float* t2, float* v2,
                                int cond_kernel_1, int opt_register_transpose);

//
// t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
//
int main(int argc, char** argv)
{
    // for sd2
    float *host_C, *host_C_chk;
    float *host_A;
    float *host_B;

    int size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f;

    // Problem Size
    size_idx_a = 16;
    size_idx_b = 16;
    size_idx_c = 16;
    size_idx_d = 16;
    size_idx_e = 16;
    size_idx_f = 16;

    //
    if (argc == 7)
    {
        size_idx_a = atoi(argv[1]);
        size_idx_b = atoi(argv[2]);
        size_idx_c = atoi(argv[3]);
        size_idx_d = atoi(argv[4]);
        size_idx_e = atoi(argv[5]);
        size_idx_f = atoi(argv[6]);
    }

    int size_C;
    int size_A;
    int size_B;
    int size_internal;

    // t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
    size_internal = size_idx_e * size_idx_f;
    size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;
    size_A = size_idx_a * size_idx_e * size_idx_b * size_idx_f;
    size_B = size_idx_d * size_idx_f * size_idx_c * size_idx_e;

    //
    host_C     = (float*)malloc(sizeof(float) * size_C);
    host_C_chk = (float*)malloc(sizeof(float) * size_C);
    host_A     = (float*)malloc(sizeof(float) * size_A);
    host_B     = (float*)malloc(sizeof(float) * size_B);

    printf("==========================================================================================================\n");
    printf(">>> abcd-aebf-dfce\n");
    printf(">>> t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];\n");
    printf(">>> Problem Size (a,b,c,d) and (e,f): (%2d,%2d,%2d,%2d) and (%2d,%2d)\n",
           size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f);
    printf("==========================================================================================================\n");

    // Initialize "1" Output and "2 x 9" Inputs
    pre_Initializing_Input_Tensors(host_C, host_C_chk, size_C, host_A, size_A, host_B, size_B);

    // Run the Kernels
    sd_t_d2_fusion_(size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f,
                    host_C, host_A, host_B, 1, -1);

#ifdef DEBUG_CORRECTNESS
    // Correctness-Check
    post_Correctness(host_C, host_C_chk, host_A, host_B,
                     size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f);
#endif

    // Free
    free(host_C);
    free(host_C_chk);
    free(host_A);
    free(host_B);

    return 0;
}

// Initialize t3 (t3_temp), 9 t2 and 9 v2.
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C,
                                    float* h_A, int size_A, float* h_B, int size_B)
{
    // t3
    int i, j;
    for (i = 0; i < size_C; i++)
    {
        h_C[i] = 0.0;
        h_C_chk[i] = 0.0;
    }

    for (j = 0; j < size_A; j++)
    {
        h_A[j] = ((float)rand() / RAND_MAX);
    }

    for (j = 0; j < size_B; j++)
    {
        h_B[j] = ((float)rand() / RAND_MAX);
    }
}

//
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B,
                      int size_idx_a, int size_idx_b, int size_idx_c,
                      int size_idx_d, int size_idx_e, int size_idx_f)
{
    // t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
    int size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;

    long long int tmp_ops = 0;
    int ops = 0;
    int idx_a, idx_b, idx_c, idx_d, idx_e, idx_f;

    for (idx_a = 0; idx_a < size_idx_a; idx_a++)
    for (idx_b = 0; idx_b < size_idx_b; idx_b++)
    for (idx_c = 0; idx_c < size_idx_c; idx_c++)
    for (idx_d = 0; idx_d < size_idx_d; idx_d++)
    {
        int tmp_r_idx = idx_a + (idx_b + (idx_c + (idx_d) * size_idx_c) * size_idx_b) * size_idx_a;
#ifdef DEBUG_SIMPLE_CORRECTNESS
        if (tmp_r_idx > 1024) break;
#endif
        for (idx_e = 0; idx_e < size_idx_e; idx_e++, ops = 0)
        {
            for (idx_f = 0; idx_f < size_idx_f; idx_f++)
            {
                h_C_chk[tmp_r_idx] += h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a]
                                    * h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d];
                ops++;
            }
            tmp_ops = tmp_ops + ops;
        }
    }

    printf("======================================= Correctness Check ==========================================\n");
    float epsilon = 0.00000001;
    int diff = 0;
    int same = 0;
    int i;
    for (i = 0; i < size_C; i++)
    {
        float check = h_C_chk[i] - h_C[i];
        if (check < 0) check *= -1;

        if (check > epsilon)
        {
            diff++;
            if (diff < 8)
                printf("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n",
                       i, h_C_chk[i], h_C[i], check);
        }
        else
        {
            same++;
        }
    }
    printf(" >>> PASSED: %'10d among %'10d in t3\n", same, size_C);
    printf(" >>> ERROR : %'10d among %'10d in t3\n", diff, size_C);
    printf(" >>> Total Operations: %'lld\n", tmp_ops * 2);
    printf("====================================================================================================\n");
}
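
Editor's note: post_Correctness() is compiled out unless DEBUG_CORRECTNESS is defined (and the reference check can be shortened further with DEBUG_SIMPLE_CORRECTNESS). Rather than uncommenting the #define, the macro can be supplied on the compile line; assuming the same flags as the Makefile:

    hipcc -O3 --offload-arch=gfx90a -DDEBUG_CORRECTNESS main.cpp 4D_kernel.hpp -o 4D_kernel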