Commit f0a8ee84 authored by Muhammed Ozturk's avatar Muhammed Ozturk

complex type contraction

parent 59136091
// created by tc_code_include() in tc_code_include.py
#include <hip/hip_runtime.h> // HIP runtime API and kernel qualifiers (hipcc may pull this in implicitly, but being explicit is safer)
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <locale.h>
#include <algorithm>
using namespace std;
// created by tc_gen_definition_new()
#define SIZE_SLICE_1_E 16
#define SIZE_SLICE_1_F 1
#define SIZE_SLICE_1_A 16
#define SIZE_SLICE_1_B 6
#define SIZE_SLICE_1_D 16
#define SIZE_SLICE_1_C 6
#define SIZE_INT_UNIT_1 (SIZE_SLICE_1_E * SIZE_SLICE_1_F)
#define SIZE_TB_1_X SIZE_SLICE_1_A
#define SIZE_TB_1_Y SIZE_SLICE_1_D
#define SIZE_REG_1_X SIZE_SLICE_1_B
#define SIZE_REG_1_Y SIZE_SLICE_1_C
#define NUM_INDEX 4
#define CEIL(a, b) (((a) + (b) - 1) / (b))
// Not Yet: Multiple Tensor Contractions.
// |Constant Memory| = 64KB, 16K Words(Integer), which means |K| <= 8192
#define MAX_CONST_LEN 8192
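// Note: two 8192-entry int tables x 4 bytes = 64KB, i.e. the whole constant-memory budget;
// when size_internal exceeds MAX_CONST_LEN, the *_tex_* kernels read the offsets from global memory instead.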
__constant__ int const_internal_t2_offset[MAX_CONST_LEN];
__constant__ int const_internal_v2_offset[MAX_CONST_LEN];
// created by tc_gen_code_Kernel()
__global__ void kernel__1_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
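// Decode the flat 1-D blockIdx.x into per-dimension tile indices: a varies fastest, then b, c, and d.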
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
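// Each thread accumulates a 6x6 register tile (SIZE_REG_1_Y x SIZE_REG_1_X, i.e. c x b) of t3 elements.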
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
// No need for boundary checks before the for-statement here
for (int ll = 0; ll < 6; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + const_internal_t2_offset[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
// No need for boundary checks before the for-statement here
for (int ll = 0; ll < 6; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + const_internal_v2_offset[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
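// Each iteration of ll consumes one internal (e,f) element: 6 v2 values (one per c-slice) and
// 6 t2 values (one per b-slice) are read from shared memory and their outer product is
// accumulated into the 6x6 register tile.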
for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
#pragma unroll 6
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__2_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
int internal_upperbound = 0;
int internal_offset;
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
// Part: Generalized Contraction Index (p7b)
internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
if (internal_offset > 0) internal_upperbound = internal_offset;
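// If the last chunk of the contraction loop is shorter than SIZE_INT_UNIT_1, internal_upperbound
// records the overshoot so the shared-memory loads and the FMA loop below stop at the last valid internal element.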
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < 6; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + const_internal_t2_offset[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < 6; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + const_internal_v2_offset[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
#pragma unroll 6
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__3_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
// need to support partial tiles
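// rng_* = number of valid elements of this block's tile along a/b/c/d; only the last block along a dimension gets a partial extent.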
int rng_a, rng_b, rng_c, rng_d;
if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A)
{
rng_a = SIZE_SLICE_1_A;
}
else
{
rng_a = size_a % SIZE_SLICE_1_A;
}
if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B)
{
rng_b = SIZE_SLICE_1_B;
}
else
{
rng_b = size_b % SIZE_SLICE_1_B;
}
if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C)
{
rng_c = SIZE_SLICE_1_C;
}
else
{
rng_c = size_c % SIZE_SLICE_1_C;
}
if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D)
{
rng_d = SIZE_SLICE_1_D;
}
else
{
rng_d = size_d % SIZE_SLICE_1_D;
}
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_a)
for (int ll = 0; ll < rng_b; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + const_internal_t2_offset[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_d)
for (int ll = 0; ll < rng_c; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + const_internal_v2_offset[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
if (idx_a < rng_a && idx_d < rng_d)
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
if(i < rng_c && j < rng_b)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__4_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
int internal_upperbound = 0;
int internal_offset;
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
// need to support partial tiles
int rng_a, rng_b, rng_c, rng_d;
if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A)
{
rng_a = SIZE_SLICE_1_A;
}
else
{
rng_a = size_a % SIZE_SLICE_1_A;
}
if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B)
{
rng_b = SIZE_SLICE_1_B;
}
else
{
rng_b = size_b % SIZE_SLICE_1_B;
}
if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C)
{
rng_c = SIZE_SLICE_1_C;
}
else
{
rng_c = size_c % SIZE_SLICE_1_C;
}
if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D)
{
rng_d = SIZE_SLICE_1_D;
}
else
{
rng_d = size_d % SIZE_SLICE_1_D;
}
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
// Part: Generalized Contraction Index (p7b)
internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
if (internal_offset > 0) internal_upperbound = internal_offset;
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_a && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < rng_b; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + const_internal_t2_offset[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_d && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < rng_c; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + const_internal_v2_offset[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
if (idx_a < rng_a && idx_d < rng_d)
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
if(i < rng_c && j < rng_b)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__1_tex_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
// No need for boundary checks before the for-statement here
for (int ll = 0; ll < 6; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + dev_internal_offset_t2[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
// No need for boundary checks before the for-statement here
for (int ll = 0; ll < 6; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + dev_internal_offset_v2[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
#pragma unroll 6
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__2_tex_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
int internal_upperbound = 0;
int internal_offset;
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
// Part: Generalized Contraction Index (p7b)
internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
if (internal_offset > 0) internal_upperbound = internal_offset;
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < 6; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + dev_internal_offset_t2[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < 6; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + dev_internal_offset_v2[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
#pragma unroll 6
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__3_tex_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
// need to support partial tiles
int rng_a, rng_b, rng_c, rng_d;
if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A)
{
rng_a = SIZE_SLICE_1_A;
}
else
{
rng_a = size_a % SIZE_SLICE_1_A;
}
if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B)
{
rng_b = SIZE_SLICE_1_B;
}
else
{
rng_b = size_b % SIZE_SLICE_1_B;
}
if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C)
{
rng_c = SIZE_SLICE_1_C;
}
else
{
rng_c = size_c % SIZE_SLICE_1_C;
}
if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D)
{
rng_d = SIZE_SLICE_1_D;
}
else
{
rng_d = size_d % SIZE_SLICE_1_D;
}
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_a)
for (int ll = 0; ll < rng_b; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + dev_internal_offset_t2[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_d)
for (int ll = 0; ll < rng_c; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + dev_internal_offset_v2[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
if (idx_a < rng_a && idx_d < rng_d)
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
if(i < rng_c && j < rng_b)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
}
// created by tc_gen_code_Kernel()
__global__ void kernel__4_tex_1(float* dev_t3,
float* dev_t2,
float* dev_v2,
int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
int* dev_internal_offset_t2, int* dev_internal_offset_v2,
int stride_reg_x, int stride_reg_y,
int size_internal)
{
// For Shared Memory,
__shared__ float sm_a[16][96];
__shared__ float sm_b[16][96];
int internal_upperbound = 0;
int internal_offset;
// when opt_pre_computed == -1, all indices will be calculated manually
// # of indices mapped on TB_X: 1
// # of indices mapped on TB_Y: 1
int idx_a = threadIdx.x;
int idx_d = threadIdx.y;
int tmp_blkIdx;
int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
tmp_blkIdx = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
tmp_blkIdx = tmp_blkIdx % (numBlk_b * numBlk_a);
int blk_idx_b = tmp_blkIdx / numBlk_a;
tmp_blkIdx = tmp_blkIdx % (numBlk_a);
int blk_idx_a = tmp_blkIdx;
int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a + (blk_idx_b * SIZE_SLICE_1_B + (blk_idx_c * SIZE_SLICE_1_C + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;
// need to support partial tiles
int rng_a, rng_b, rng_c, rng_d;
if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A)
{
rng_a = SIZE_SLICE_1_A;
}
else
{
rng_a = size_a % SIZE_SLICE_1_A;
}
if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B)
{
rng_b = SIZE_SLICE_1_B;
}
else
{
rng_b = size_b % SIZE_SLICE_1_B;
}
if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C)
{
rng_c = SIZE_SLICE_1_C;
}
else
{
rng_c = size_c % SIZE_SLICE_1_C;
}
if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D)
{
rng_d = SIZE_SLICE_1_D;
}
else
{
rng_d = size_d % SIZE_SLICE_1_D;
}
float temp_av;
float temp_bv[6];
float reg_tile[6][6];
for (int i = 0; i < 6; i++)
for (int j = 0; j < 6; j++)
reg_tile[i][j] = 0.0;
// tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
#pragma unroll 1
for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
{
// Part: Generalized Contraction Index (p7b)
internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
if (internal_offset > 0) internal_upperbound = internal_offset;
//---------------------------------------------------------------------------------------------------
// This is for the new version
// This Part is for Loading Input-Left
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_a && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < rng_b; ll++)
{
// ['a', 'e', 'b', 'f']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_a
sm_a[threadIdx.y][threadIdx.x + ll * 16] = dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a + dev_internal_offset_t2[threadIdx.y + l]];
}
// This Part is for Loading Input-Right
// tc_gen_code_Kernel_Load_Inputs_Abstracts()
if (idx_a < rng_d && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
for (int ll = 0; ll < rng_c; ll++)
{
// ['d', 'f', 'c', 'e']
// Exception: Temp. version!: threadIdx.y + l
// Exception: Temp. version!: idx_a < rng_d
sm_b[threadIdx.y][threadIdx.x + ll * 16] = dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d + dev_internal_offset_v2[threadIdx.y + l]];
}
__syncthreads();
//---------------------------------------------------------------------------------------------------
// Part: Generalized Threads
for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
{
temp_bv[0] = sm_b[ll][idx_d + 0];
temp_bv[1] = sm_b[ll][idx_d + 16];
temp_bv[2] = sm_b[ll][idx_d + 32];
temp_bv[3] = sm_b[ll][idx_d + 48];
temp_bv[4] = sm_b[ll][idx_d + 64];
temp_bv[5] = sm_b[ll][idx_d + 80];
for (int xx = 0; xx < 6; xx++) // (1)
{
temp_av = sm_a[ll][idx_a + (xx * 16)];
reg_tile[0][xx] += temp_av * temp_bv[0];
reg_tile[1][xx] += temp_av * temp_bv[1];
reg_tile[2][xx] += temp_av * temp_bv[2];
reg_tile[3][xx] += temp_av * temp_bv[3];
reg_tile[4][xx] += temp_av * temp_bv[4];
reg_tile[5][xx] += temp_av * temp_bv[5];
}
}
__syncthreads();
}
// Store Results (Registers) to Global Memory
// Part: Generalized Threads
// Part: Generalized Register-Tiling
if (idx_a < rng_a && idx_d < rng_d)
for (int i = 0; i < 6; i++)
{
for (int j = 0; j < 6; j++)
{
if(i < rng_c && j < rng_b)
{
dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
}
}
}
}
// written by tc_interface.tc_gen_code_interface_Header()
extern "C"
void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* host_t2, float* host_v2, int cond_kernel_1, int opt_register_transpose)
{
int num_thread_blocks_kernel_1;
float* dev_t3;
float* dev_t2;
float* dev_v2;
int* host_internal_left_offset;
int* host_internal_right_offset;
num_thread_blocks_kernel_1 = CEIL(size_a, SIZE_SLICE_1_A) * CEIL(size_b, SIZE_SLICE_1_B) * CEIL(size_c, SIZE_SLICE_1_C) * CEIL(size_d, SIZE_SLICE_1_D);
// hipMalloc()
hipMalloc((void**) &dev_t3, sizeof(float) * size_a * size_b * size_c * size_d);
hipMalloc((void**) &dev_t2, sizeof(float) * size_f * size_b * size_e * size_a);
hipMalloc((void**) &dev_v2, sizeof(float) * size_e * size_c * size_f * size_d);
// hipMemcpy()
hipMemcpy(dev_t3, t3, sizeof(float) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice);
hipMemcpy(dev_t2, host_t2, sizeof(float) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice);
hipMemcpy(dev_v2, host_v2, sizeof(float) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice);
// Related to Kernels
// There are 1 Basic Kernels
long long int tmp_operations = (long long int) size_a * size_b * size_c * size_d * size_e * size_f;
printf ("========================================= fusedKernels =============================================\n");
printf (" Grid Size : %6d (1D)\n", num_thread_blocks_kernel_1);
printf (" Block-size : %2d, %2d (2D)\n", SIZE_TB_1_X, SIZE_TB_1_Y);
printf (" Reg.-size : %2d, %2d (2D)\n", SIZE_REG_1_X, SIZE_REG_1_Y);
printf (" A thread deals with (%d x %d) elements (basically)\n", SIZE_TB_1_X * SIZE_REG_1_X, SIZE_TB_1_Y * SIZE_REG_1_Y);
printf (" # of Operations: %lld\n", tmp_operations);
printf ("====================================================================================================\n");
dim3 gridsize_1(num_thread_blocks_kernel_1);
dim3 blocksize_1(SIZE_TB_1_X, SIZE_TB_1_Y);
int stride_output_a = 1;
int stride_output_b = stride_output_a * size_a;
int stride_output_c = stride_output_b * size_b;
int stride_output_d = stride_output_c * size_c;
int stride_reg_x_1 = stride_output_b;
int stride_reg_y_1 = stride_output_c;
int size_internal = size_e * size_f;
// (manually) ['e', 'f']
host_internal_left_offset = (int*)malloc(sizeof(int) * size_internal);
host_internal_right_offset = (int*)malloc(sizeof(int) * size_internal);
for (int idx_f = 0; idx_f < size_f; idx_f++)
for (int idx_e = 0; idx_e < size_e; idx_e++)
{
host_internal_left_offset[idx_e + (idx_f) * size_e] = (idx_e + ((idx_f) * size_b) * size_e) * size_a;
host_internal_right_offset[idx_e + (idx_f) * size_e] = (idx_f + ((idx_e) * size_c) * size_f) * size_d;
}
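// The offset tables hold the (e,f)-dependent part of the linear index into t2 and v2, respectively;
// the kernels add these to the externally-indexed base address when staging tiles into shared memory.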
hipMemcpyToSymbol(const_internal_t2_offset, host_internal_left_offset, sizeof(int) * size_internal);
hipMemcpyToSymbol(const_internal_v2_offset, host_internal_right_offset, sizeof(int) * size_internal);
int* dev_internal_offset_t2;
int* dev_internal_offset_v2;
// hipMalloc()
hipMalloc((void**) &dev_internal_offset_t2, sizeof(int) * size_internal);
hipMalloc((void**) &dev_internal_offset_v2, sizeof(int) * size_internal);
// hipMemcpy()
hipMemcpy(dev_internal_offset_t2, host_internal_left_offset, sizeof(int) * size_internal, hipMemcpyHostToDevice);
hipMemcpy(dev_internal_offset_v2, host_internal_right_offset, sizeof(int) * size_internal, hipMemcpyHostToDevice);
// Decision Tree for Kernel Types
// No Chance to Utilize the Register Transpose
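// kernel__1: external full / internal full, kernel__2: external full / internal partial,
// kernel__3: external partial / internal full, kernel__4: external partial / internal partial;
// the *_tex_* variants take the offset tables from global memory when they do not fit in constant memory.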
if (size_a % SIZE_SLICE_1_A == 0 && size_b % SIZE_SLICE_1_B == 0 && size_c % SIZE_SLICE_1_C == 0 && size_d % SIZE_SLICE_1_D == 0)
{
// [2] External Index: Full
if (size_e % SIZE_SLICE_1_E == 0 && size_f % SIZE_SLICE_1_F == 0)
{
// [3] Internal Index: Full
// >>> External: Full && Internal: Full
printf ("External: Full, Internal: Full\n");
if (size_internal > MAX_CONST_LEN)
{
kernel__1_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), dev_internal_offset_t2, dev_internal_offset_v2, stride_reg_x_1, stride_reg_y_1, size_internal);
}
else
{
kernel__1_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), stride_reg_x_1, stride_reg_y_1, size_internal);
}
}
else
{
// [4] Internal Index: Partial
// >>> External: Full && Internal: Partial
printf ("External: Full, Internal: Partial\n");
if (size_internal > MAX_CONST_LEN)
{
kernel__2_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), dev_internal_offset_t2, dev_internal_offset_v2, stride_reg_x_1, stride_reg_y_1, size_internal);
}
else
{
kernel__2_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), stride_reg_x_1, stride_reg_y_1, size_internal);
}
}
}
else
{
// [2] External Index: Partial
if (size_e % SIZE_SLICE_1_E == 0 && size_f % SIZE_SLICE_1_F == 0)
{
// [3] Internal Index: Full
// >>> External: Partial && Internal: Full
printf ("External: Partial, Internal: Full\n");
if (size_internal > MAX_CONST_LEN)
{
kernel__3_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), dev_internal_offset_t2, dev_internal_offset_v2, stride_reg_x_1, stride_reg_y_1, size_internal);
}
else
{
kernel__3_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), stride_reg_x_1, stride_reg_y_1, size_internal);
}
}
else
{
// [4] Internal Index: Partial
// >>> External: Partial && Internal: Partial
printf ("External: Partial, Internal: Partial\n");
if (size_internal > MAX_CONST_LEN)
{
kernel__4_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), dev_internal_offset_t2, dev_internal_offset_v2, stride_reg_x_1, stride_reg_y_1, size_internal);
}
else
{
kernel__4_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2, size_a, size_b, size_c, size_d, size_e, size_f, CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B), CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D), stride_reg_x_1, stride_reg_y_1, size_internal);
}
}
}
// Copy the Result from Device to Host
hipMemcpy(t3, dev_t3, sizeof(float) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost);
// hipFree()
hipFree(dev_t3); hipFree(dev_t2); hipFree(dev_v2);
// Free the host-side and device-side offset tables
free(host_internal_left_offset); free(host_internal_right_offset);
hipFree(dev_internal_offset_t2); hipFree(dev_internal_offset_v2);
}
// This is written by tc_interface.tc_gen_code_interface()
// This Interface Should be Called to Run the Kernels
extern "C"
void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* t2, float* v2, int cond_kernel_1, int opt_register_transpose)
{
// Pre-Processing for Split
// Based on Tile-Sizes and Problem-Size
// Currently, one index can be split into two indices
// Call An Application
sd_t_d2_fusion(size_a, size_b, size_c, size_d, size_e, size_f, t3, t2, v2, cond_kernel_1, opt_register_transpose);
}
4D_kernel:
	hipcc -O3 --offload-arch=gfx90a main.cpp 4D_kernel.hpp -o $@

clean:
	rm -f 4D_kernel
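# assumed usage after building (problem sizes a b c d e f), e.g.:
#   ./4D_kernel 16 16 16 16 16 16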
//
// Sample Code:
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#define DEBUG_CORRECTNESS
//#define DEBUG_SIMPLE_CORRECTNESS
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C, float* h_A, int size_A, float* h_B, int size_B);
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int size_idx_a, int size_idx_b, int size_idx_c, int size_idx_d, int size_idx_e, int size_idx_f);
// interface to the generated kernels (defined in the kernel file); declared here so this sample compiles stand-alone
extern "C" void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f, float* t3, float* t2, float* v2, int cond_kernel_1, int opt_register_transpose);
//
// t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
//
int main(int argc, char** argv)
{
// for sd2
float *host_C, *host_C_chk;
float *host_A;
float *host_B;
int size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f;
// Problem Size
size_idx_a = 16;
size_idx_b = 16;
size_idx_c = 16;
size_idx_d = 16;
size_idx_e = 16;
size_idx_f = 16;
//
if (argc == 7)
{
size_idx_a = atoi(argv[1]);
size_idx_b = atoi(argv[2]);
size_idx_c = atoi(argv[3]);
size_idx_d = atoi(argv[4]);
size_idx_e = atoi(argv[5]);
size_idx_f = atoi(argv[6]);
}
int size_C;
int size_A;
int size_B;
int size_internal;
// t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
size_internal = size_idx_e * size_idx_f;
size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;
size_A = size_idx_a * size_idx_e * size_idx_b * size_idx_f;
size_B = size_idx_d * size_idx_f * size_idx_c * size_idx_e;
//
host_C = (float*)malloc(sizeof(float) * size_C);
host_C_chk = (float*)malloc(sizeof(float) * size_C);
host_A = (float*)malloc(sizeof(float) * size_A);
host_B = (float*)malloc(sizeof(float) * size_B);
printf ("==========================================================================================================\n");
printf (">>> abcd-aebf-dfce\n");
printf (">>> t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];\n");
printf (">>> Problem Size (a,b,c,d) and (e,f): (%2d,%2d,%2d,%2d) and (%2d,%2d)\n", size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f);
printf ("==========================================================================================================\n");
// Initialize the "1" Output and the "2" Input Tensors
pre_Initializing_Input_Tensors(host_C, host_C_chk, size_C, host_A, size_A, host_B, size_B);
// Run the Kernels
sd_t_d2_fusion_(size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f, host_C, host_A, host_B, 1, -1);
#ifdef DEBUG_CORRECTNESS
// Correctness-Check
post_Correctness(host_C, host_C_chk, host_A, host_B, size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f);
#endif
// Free
free(host_C); free(host_C_chk);
free(host_A);
free(host_B);
return 0;
}
// Initialize t3 (and its host check copy), t2, and v2.
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C, float* h_A, int size_A, float* h_B, int size_B)
{
// t3
int i, j;
for (i = 0; i < size_C; i++)
{
h_C[i] = 0.0;
h_C_chk[i] = 0.0;
}
for (j = 0; j < size_A; j++)
{
h_A[j] = ((float)rand() / RAND_MAX);
}
for (j = 0; j < size_B; j++)
{
h_B[j] = ((float)rand() / RAND_MAX);
}
}
//
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B, int size_idx_a, int size_idx_b, int size_idx_c, int size_idx_d, int size_idx_e, int size_idx_f)
{
// t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
int size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;
long long int tmp_ops = 0;
int ops = 0;
int idx_a, idx_b, idx_c, idx_d, idx_e, idx_f;
for (idx_a = 0; idx_a < size_idx_a; idx_a++)
for (idx_b = 0; idx_b < size_idx_b; idx_b++)
for (idx_c = 0; idx_c < size_idx_c; idx_c++)
for (idx_d = 0; idx_d < size_idx_d; idx_d++)
{
int tmp_r_idx = idx_a + (idx_b + (idx_c + (idx_d) * size_idx_c) * size_idx_b) * size_idx_a;
#ifdef DEBUG_SIMPLE_CORRECTNESS
if (tmp_r_idx > 1024)
break;
#endif
for (idx_e = 0; idx_e < size_idx_e; idx_e++, ops = 0)
{
for (idx_f = 0; idx_f < size_idx_f; idx_f++)
{
h_C_chk[tmp_r_idx] += h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a] *
h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d];
ops++;
}
tmp_ops = tmp_ops + ops;
}
}
printf ("======================================= Correctness Check ==========================================\n");
float epsilon = 0.001; // tolerance loosened for single-precision accumulation over size_e * size_f terms (1e-8 is below float rounding error here)
int diff = 0;
int same = 0;
int i;
for (i = 0; i < size_C; i++)
{
float check = h_C_chk[i] - h_C[i];
if (check < 0) check *= -1;
if (check > epsilon)
{
diff++;
if (diff < 8)
printf ("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n", i, h_C_chk[i], h_C[i], check);
}
else
{
same++;
}
}
printf (" >>> PASSED: %'10d among %'10d in t3\n", same, size_C);
printf (" >>> ERROR : %'10d among %'10d in t3\n", diff, size_C);
printf (" >>> Total Operations: %'lld\n", tmp_ops * 2);
printf ("====================================================================================================\n");
}