gaoqiong / composable_kernel / Commits / f0a8ee84
"vscode:/vscode.git/clone" did not exist on "3552bca1015f1257b8efbd0567a60a5862ea11e7"
Commit f0a8ee84, authored Oct 10, 2023 by Muhammed Ozturk

complex type contraction

Parent: 59136091

Showing 3 changed files with 1451 additions and 0 deletions (+1451, -0):
example/complex_contraction/4D_kernel.hpp   +1277  -0
example/complex_contraction/Makefile        +8     -0
example/complex_contraction/main.cpp        +166   -0
example/complex_contraction/4D_kernel.hpp  (new file, 0 → 100644)
// created by tc_code_include() in tc_code_include.py
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <locale.h>
#include <algorithm>
using namespace std;

// created by tc_gen_definition_new()
#define SIZE_SLICE_1_E 16
#define SIZE_SLICE_1_F 1
#define SIZE_SLICE_1_A 16
#define SIZE_SLICE_1_B 6
#define SIZE_SLICE_1_D 16
#define SIZE_SLICE_1_C 6

#define SIZE_INT_UNIT_1 SIZE_SLICE_1_E * SIZE_SLICE_1_F

#define SIZE_TB_1_X SIZE_SLICE_1_A
#define SIZE_TB_1_Y SIZE_SLICE_1_D
#define SIZE_REG_1_X SIZE_SLICE_1_B
#define SIZE_REG_1_Y SIZE_SLICE_1_C

#define NUM_INDEX 4
#define CEIL(a, b) (((a) + (b) - 1) / (b))

// Not Yet: Multiple Tensor Contractions.
// |Constant Memory| = 64KB, 16K Words(Integer), which means |K| <= 8192
#define MAX_CONST_LEN 8192
__constant__ int const_internal_t2_offset[MAX_CONST_LEN];
__constant__ int const_internal_v2_offset[MAX_CONST_LEN];
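
Editor's note: the 8192 cap follows directly from the budget quoted in the comment above: the two tables together hold 2 x 8192 four-byte integers, exactly 64 KB. A minimal compile-time guard (an illustration added here, not part of the commit) would make that invariant explicit:

    // Hypothetical sanity check (not in the commit): both __constant__ tables
    // together must fit the 64 KB constant-memory budget cited above.
    static_assert(2 * MAX_CONST_LEN * sizeof(int) <= 64 * 1024,
                  "internal offset tables exceed 64 KB of constant memory");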
// created by tc_gen_code_Kernel()
__global__ void kernel__1_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['a', 'e', 'b', 'f']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_a
            sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                     + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                     + const_internal_t2_offset[threadIdx.y + l]];
        }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['d', 'f', 'c', 'e']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_d
            sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                     + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                     + const_internal_v2_offset[threadIdx.y + l]];
        }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
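
Editor's note: every kernel in this file recovers (blk_idx_a, blk_idx_b, blk_idx_c, blk_idx_d) from the flat 1-D blockIdx.x with the same divide/modulo cascade. The host-side sketch below (hypothetical and added for illustration; check_delinearize is not part of the commit) verifies that the cascade inverts the row-major linearization flat = ((d * numBlk_c + c) * numBlk_b + b) * numBlk_a + a:

    // Hypothetical host-side round-trip check of the kernels' delinearization.
    #include <assert.h>
    void check_delinearize(int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d) {
        for (int flat = 0; flat < numBlk_a * numBlk_b * numBlk_c * numBlk_d; flat++) {
            int blk_idx_d = flat / (numBlk_c * numBlk_b * numBlk_a);
            int rem       = flat % (numBlk_c * numBlk_b * numBlk_a);
            int blk_idx_c = rem / (numBlk_b * numBlk_a);
            rem           = rem % (numBlk_b * numBlk_a);
            int blk_idx_b = rem / numBlk_a;
            int blk_idx_a = rem % numBlk_a;
            // Re-linearize and confirm we land back on the same flat index.
            assert(((blk_idx_d * numBlk_c + blk_idx_c) * numBlk_b + blk_idx_b) * numBlk_a
                   + blk_idx_a == flat);
        }
    }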
// created by tc_gen_code_Kernel()
__global__ void kernel__2_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + const_internal_t2_offset[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + const_internal_v2_offset[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
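
Editor's note: kernel__2_1 is kernel__1_1 plus the internal_upperbound logic, which trims the final trip of the l-loop when size_internal is not a multiple of SIZE_INT_UNIT_1. A worked example with assumed values size_internal = 20 and SIZE_INT_UNIT_1 = 16: on the second trip (l = 16), internal_offset = (16 + 16) - 20 = 12 > 0, so internal_upperbound becomes 12 and only 16 - 12 = 4 shared-memory rows are loaded and reduced, which keeps const_internal_*_offset[threadIdx.y + l] within the valid range.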
// created by tc_gen_code_Kernel()
__global__ void kernel__3_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + const_internal_t2_offset[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + const_internal_v2_offset[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
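
Editor's note: kernel__3_1 is kernel__1_1 plus the rng_* bounds for partial tiles on the external indices. A worked example with an assumed size_a = 20 and SIZE_SLICE_1_A = 16: CEIL(20, 16) = 2 blocks along a; for the first, 20 - 0 * 16 >= 16, so rng_a = 16, while for the second, 20 - 16 = 4 < 16, so rng_a = 20 % 16 = 4, and the idx_a < rng_a guards idle the twelve out-of-range lanes of that last block.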
// created by tc_gen_code_Kernel()
__global__ void kernel__4_1(float* dev_t3, float* dev_t2, float* dev_v2,
                            int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                            int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                            int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + const_internal_t2_offset[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + const_internal_v2_offset[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
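
Editor's note: the four kernel__*_tex_1 variants that follow are line-for-line copies of kernel__1_1 through kernel__4_1 with a single difference: the per-iteration input offsets are read from the global-memory arrays dev_internal_offset_t2 / dev_internal_offset_v2, passed as two extra pointer parameters, instead of from the __constant__ tables. The decision tree in sd_t_d2_fusion() below selects a _tex_ variant whenever size_internal > MAX_CONST_LEN, i.e., when the offset tables no longer fit in constant memory.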
// created by tc_gen_code_Kernel()
__global__ void kernel__1_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['a', 'e', 'b', 'f']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_a
            sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                     + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                     + dev_internal_offset_t2[threadIdx.y + l]];
        }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        // No Need to Put Boundary-Checks before For-Statement: :
        for (int ll = 0; ll < 6; ll++)
        {
            // ['d', 'f', 'c', 'e']
            // Exception: Temp. version!: threadIdx.y + l
            // Exception: Temp. version!: idx_a < rng_d
            sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                     + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                     + dev_internal_offset_v2[threadIdx.y + l]];
        }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
// created by tc_gen_code_Kernel()
__global__ void kernel__2_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + dev_internal_offset_t2[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < 6; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + dev_internal_offset_v2[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    #pragma unroll 6
    for (int i = 0; i < 6; i++)
    {
        for (int j = 0; j < 6; j++)
        {
            dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
        }
    }
}
// created by tc_gen_code_Kernel()
__global__ void kernel__3_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + dev_internal_offset_t2[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + dev_internal_offset_v2[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
// created by tc_gen_code_Kernel()
__global__ void kernel__4_tex_1(float* dev_t3, float* dev_t2, float* dev_v2,
                                int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                int numBlk_a, int numBlk_b, int numBlk_c, int numBlk_d,
                                int* dev_internal_offset_t2, int* dev_internal_offset_v2,
                                int stride_reg_x, int stride_reg_y, int size_internal)
{
    // For Shared Memory,
    __shared__ float sm_a[16][96];
    __shared__ float sm_b[16][96];

    int internal_upperbound = 0;
    int internal_offset;

    // when opt_pre_computed == -1, all indices will be calculated manually
    // # of indices mapped on TB_X: 1
    // # of indices mapped on TB_Y: 1
    int idx_a = threadIdx.x;
    int idx_d = threadIdx.y;

    int tmp_blkIdx;
    int blk_idx_d = blockIdx.x / (numBlk_c * numBlk_b * numBlk_a);
    tmp_blkIdx    = blockIdx.x % (numBlk_c * numBlk_b * numBlk_a);
    int blk_idx_c = tmp_blkIdx / (numBlk_b * numBlk_a);
    tmp_blkIdx    = tmp_blkIdx % (numBlk_b * numBlk_a);
    int blk_idx_b = tmp_blkIdx / numBlk_a;
    tmp_blkIdx    = tmp_blkIdx % (numBlk_a);
    int blk_idx_a = tmp_blkIdx;

    int t3_base_thread = blk_idx_a * SIZE_SLICE_1_A + idx_a
                       + (blk_idx_b * SIZE_SLICE_1_B
                       + (blk_idx_c * SIZE_SLICE_1_C
                       + (blk_idx_d * SIZE_SLICE_1_D + idx_d) * size_c) * size_b) * size_a;

    // need to support partial tiles
    int rng_a, rng_b, rng_c, rng_d;
    if ((size_a - (blk_idx_a * SIZE_SLICE_1_A)) >= SIZE_SLICE_1_A) { rng_a = SIZE_SLICE_1_A; } else { rng_a = size_a % SIZE_SLICE_1_A; }
    if ((size_b - (blk_idx_b * SIZE_SLICE_1_B)) >= SIZE_SLICE_1_B) { rng_b = SIZE_SLICE_1_B; } else { rng_b = size_b % SIZE_SLICE_1_B; }
    if ((size_c - (blk_idx_c * SIZE_SLICE_1_C)) >= SIZE_SLICE_1_C) { rng_c = SIZE_SLICE_1_C; } else { rng_c = size_c % SIZE_SLICE_1_C; }
    if ((size_d - (blk_idx_d * SIZE_SLICE_1_D)) >= SIZE_SLICE_1_D) { rng_d = SIZE_SLICE_1_D; } else { rng_d = size_d % SIZE_SLICE_1_D; }

    float temp_av;
    float temp_bv[6];
    float reg_tile[6][6];

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 6; j++)
            reg_tile[i][j] = 0.0;

    // tensor contraction: [[16, 'STR_SD2_T2_H7', 'x', 't2', ['a', 'e', 'b', 'f']], [16, 'STR_SD2_V2_H7', 'y', 'v2', ['d', 'f', 'c', 'e']], '+=']
    #pragma unroll 1
    for (int l = 0; l < size_internal; l += SIZE_INT_UNIT_1)
    {
        // Part: Generalized Contraction Index (p7b)
        internal_offset = (l + SIZE_INT_UNIT_1) - size_internal;
        if (internal_offset > 0) internal_upperbound = internal_offset;

        //---------------------------------------------------------------------------------------------------
        // This is for the new version
        // This Part is for Loading Input-Left
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_a && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_b; ll++)
            {
                // ['a', 'e', 'b', 'f']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_a
                sm_a[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_t2[blk_idx_a * SIZE_SLICE_1_A + idx_a
                         + ((blk_idx_b * SIZE_SLICE_1_B + ll) * size_e) * size_a
                         + dev_internal_offset_t2[threadIdx.y + l]];
            }

        // This Part is for Loading Input-Right
        // tc_gen_code_Kernel_Load_Inputs_Abstracts()
        if (idx_a < rng_d && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
            for (int ll = 0; ll < rng_c; ll++)
            {
                // ['d', 'f', 'c', 'e']
                // Exception: Temp. version!: threadIdx.y + l
                // Exception: Temp. version!: idx_a < rng_d
                sm_b[threadIdx.y][threadIdx.x + ll * 16] =
                    dev_v2[blk_idx_d * SIZE_SLICE_1_D + idx_a
                         + ((blk_idx_c * SIZE_SLICE_1_C + ll) * size_f) * size_d
                         + dev_internal_offset_v2[threadIdx.y + l]];
            }
        __syncthreads();

        //---------------------------------------------------------------------------------------------------
        // Part: Generalized Threads
        for (int ll = 0; ll < SIZE_INT_UNIT_1 - internal_upperbound; ll++)
        {
            temp_bv[0] = sm_b[ll][idx_d + 0];
            temp_bv[1] = sm_b[ll][idx_d + 16];
            temp_bv[2] = sm_b[ll][idx_d + 32];
            temp_bv[3] = sm_b[ll][idx_d + 48];
            temp_bv[4] = sm_b[ll][idx_d + 64];
            temp_bv[5] = sm_b[ll][idx_d + 80];

            for (int xx = 0; xx < 6; xx++) // (1)
            {
                temp_av = sm_a[ll][idx_a + (xx * 16)];

                reg_tile[0][xx] += temp_av * temp_bv[0];
                reg_tile[1][xx] += temp_av * temp_bv[1];
                reg_tile[2][xx] += temp_av * temp_bv[2];
                reg_tile[3][xx] += temp_av * temp_bv[3];
                reg_tile[4][xx] += temp_av * temp_bv[4];
                reg_tile[5][xx] += temp_av * temp_bv[5];
            }
        }
        __syncthreads();
    }

    // Store Results (Registers) to Global Memory
    // Part: Generalized Threads
    // Part: Generalized Register-Tiling
    if (idx_a < rng_a && idx_d < rng_d)
        for (int i = 0; i < 6; i++)
        {
            for (int j = 0; j < 6; j++)
            {
                if (i < rng_c && j < rng_b)
                {
                    dev_t3[t3_base_thread + (i * stride_reg_y) + (j * stride_reg_x)] = reg_tile[i][j];
                }
            }
        }
}
// written by tc_interface.tc_gen_code_interface_Header()
extern "C"
void sd_t_d2_fusion(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                    float* t3, float* host_t2, float* host_v2,
                    int cond_kernel_1, int opt_register_transpose)
{
    int num_thread_blocks_kernel_1;
    float* dev_t3;
    float* dev_t2;
    float* dev_v2;
    int* host_internal_left_offset;
    int* host_internal_right_offset;

    num_thread_blocks_kernel_1 = CEIL(size_a, SIZE_SLICE_1_A) * CEIL(size_b, SIZE_SLICE_1_B)
                               * CEIL(size_c, SIZE_SLICE_1_C) * CEIL(size_d, SIZE_SLICE_1_D);

    // hipMalloc()
    hipMalloc((void**)&dev_t3, sizeof(float) * size_a * size_b * size_c * size_d);
    hipMalloc((void**)&dev_t2, sizeof(float) * size_f * size_b * size_e * size_a);
    hipMalloc((void**)&dev_v2, sizeof(float) * size_e * size_c * size_f * size_d);

    // hipMemcpy()
    hipMemcpy(dev_t3, t3, sizeof(float) * size_a * size_b * size_c * size_d, hipMemcpyHostToDevice);
    hipMemcpy(dev_t2, host_t2, sizeof(float) * size_f * size_b * size_e * size_a, hipMemcpyHostToDevice);
    hipMemcpy(dev_v2, host_v2, sizeof(float) * size_e * size_c * size_f * size_d, hipMemcpyHostToDevice);

    // Related to Kernels
    // There is 1 Basic Kernel
    long long int tmp_operations = (long long int)((long long int)(size_a * size_b * size_c * size_d) * size_e) * size_f;

    printf("========================================= fusedKernels =============================================\n");
    printf(" Grid Size : %6d (1D)\n", num_thread_blocks_kernel_1);
    printf(" Block-size : %2d, %2d (2D)\n", SIZE_TB_1_X, SIZE_TB_1_Y);
    printf(" Reg.-size : %2d, %2d (2D)\n", SIZE_REG_1_X, SIZE_REG_1_Y);
    printf(" A thread deals with (%d x %d) elements (basically)\n", SIZE_TB_1_X * SIZE_REG_1_X, SIZE_TB_1_Y * SIZE_REG_1_Y);
    printf(" # of Operations: %lld\n", tmp_operations);
    printf("====================================================================================================\n");

    dim3 gridsize_1(num_thread_blocks_kernel_1);
    dim3 blocksize_1(SIZE_TB_1_X, SIZE_TB_1_Y);

    int stride_output_a = 1;
    int stride_output_b = stride_output_a * size_a;
    int stride_output_c = stride_output_b * size_b;
    int stride_output_d = stride_output_c * size_c;

    int stride_reg_x_1 = stride_output_b;
    int stride_reg_y_1 = stride_output_c;

    int size_internal = size_e * size_f;

    // (manually) ['e', 'f']
    host_internal_left_offset  = (int*)malloc(sizeof(int) * size_internal);
    host_internal_right_offset = (int*)malloc(sizeof(int) * size_internal);

    for (int idx_f = 0; idx_f < size_f; idx_f++)
        for (int idx_e = 0; idx_e < size_e; idx_e++)
        {
            host_internal_left_offset[idx_e + (idx_f) * size_e]  = (idx_e + ((idx_f) * size_b) * size_e) * size_a;
            host_internal_right_offset[idx_e + (idx_f) * size_e] = (idx_f + ((idx_e) * size_c) * size_f) * size_d;
        }

    hipMemcpyToSymbol(const_internal_t2_offset, host_internal_left_offset, sizeof(int) * size_internal);
    hipMemcpyToSymbol(const_internal_v2_offset, host_internal_right_offset, sizeof(int) * size_internal);

    int* dev_internal_offset_t2;
    int* dev_internal_offset_v2;

    // hipMalloc()
    hipMalloc((void**)&dev_internal_offset_t2, sizeof(int) * size_internal);
    hipMalloc((void**)&dev_internal_offset_v2, sizeof(int) * size_internal);

    // hipMemcpy()
    hipMemcpy(dev_internal_offset_t2, host_internal_left_offset, sizeof(int) * size_internal, hipMemcpyHostToDevice);
    hipMemcpy(dev_internal_offset_v2, host_internal_right_offset, sizeof(int) * size_internal, hipMemcpyHostToDevice);

    // Decision Tree for Kernel Types
    // No Chance to Utilize the Register Transpose
    if (size_a % SIZE_SLICE_1_A == 0 && size_b % SIZE_SLICE_1_B == 0 &&
        size_c % SIZE_SLICE_1_C == 0 && size_d % SIZE_SLICE_1_D == 0)
    {
        // [2] External Index: Full
        if (size_e % SIZE_SLICE_1_E == 0 && size_f % SIZE_SLICE_1_F == 0)
        {
            // [3] Internal Index: Full
            // >>> External: Full && Internal: Full
            printf("External: Full, Internal: Full\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__1_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__1_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
        else
        {
            // [4] Internal Index: Partial
            // >>> External: Full && Internal: Partial
            printf("External: Full, Internal: Partial\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__2_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__2_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
    }
    else
    {
        // [2] External Index: Partial
        if (size_e % SIZE_SLICE_1_E == 0 && size_f % SIZE_SLICE_1_F == 0)
        {
            // [3] Internal Index: Full
            // >>> External: Partial && Internal: Full
            printf("External: Partial, Internal: Full\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__3_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__3_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
        else
        {
            // [4] Internal Index: Partial
            // >>> External: Partial && Internal: Partial
            printf("External: Partial, Internal: Partial\n");
            if (size_internal > MAX_CONST_LEN)
            {
                kernel__4_tex_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    dev_internal_offset_t2, dev_internal_offset_v2,
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
            else
            {
                kernel__4_1<<<gridsize_1, blocksize_1>>>(dev_t3, dev_t2, dev_v2,
                    size_a, size_b, size_c, size_d, size_e, size_f,
                    CEIL(size_a, SIZE_SLICE_1_A), CEIL(size_b, SIZE_SLICE_1_B),
                    CEIL(size_c, SIZE_SLICE_1_C), CEIL(size_d, SIZE_SLICE_1_D),
                    stride_reg_x_1, stride_reg_y_1, size_internal);
            }
        }
    }

    // Copy the Result from Device to Host
    hipMemcpy(t3, dev_t3, sizeof(float) * (size_a * size_b * size_c * size_d), hipMemcpyDeviceToHost);

    // hipFree()
    hipFree(dev_t3);
    hipFree(dev_t2);
    hipFree(dev_v2);

    // Should be Fixed
    // HostFree
}
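
Editor's note: the closing "Should be Fixed / HostFree" comment flags a real leak: host_internal_left_offset, host_internal_right_offset, dev_internal_offset_t2, and dev_internal_offset_v2 are allocated but never released. A minimal sketch of the missing cleanup (not in the commit), to be placed just before sd_t_d2_fusion() returns:

    // Sketch of the cleanup requested by the "Should be Fixed // HostFree" comment.
    free(host_internal_left_offset);
    free(host_internal_right_offset);
    hipFree(dev_internal_offset_t2);
    hipFree(dev_internal_offset_v2);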
// This is written by tc_interface.tc_gen_code_interface()
// This Interface Should be Called to Run the Kernels
extern "C"
void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                     float* t3, float* t2, float* v2,
                     int cond_kernel_1, int opt_register_transpose)
{
    // Pre-Processing for Split
    // Based on Tile-Sizes and Problem-Size
    // Currently, one index can be split into two indices
    // Call An Application
    sd_t_d2_fusion(size_a, size_b, size_c, size_d, size_e, size_f,
                   t3, t2, v2, cond_kernel_1, opt_register_transpose);
}
example/complex_contraction/Makefile  (new file, 0 → 100644)
4D_kernel:
	hipcc -O3 --offload-arch=gfx90a main.cpp 4D_kernel.hpp -o $@

clean:
	rm 4D_kernel
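
Editor's note: main.cpp defaults all six extents to 16 and reads them from the command line when exactly six are given (argc == 7), so a typical session looks like the following; the extents in the second invocation are arbitrary example values, not values from the commit:

    make
    ./4D_kernel                    # all extents default to 16
    ./4D_kernel 32 6 6 32 16 1     # a b c d e f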
example/complex_contraction/main.cpp  (new file, 0 → 100644)
//
// Sample Code:
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//#define DEBUG_CORRECTNESS
//#define DEBUG_SIMPLE_CORRECTNESS

// Full prototypes (the commit declares these with empty parameter lists, which
// does not compile as C++ once they are called with arguments below).
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C,
                                    float* h_A, int size_A, float* h_B, int size_B);
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B,
                      int size_idx_a, int size_idx_b, int size_idx_c,
                      int size_idx_d, int size_idx_e, int size_idx_f);
// Defined in 4D_kernel.hpp; declared here so this translation unit compiles.
extern "C" void sd_t_d2_fusion_(int size_a, int size_b, int size_c, int size_d, int size_e, int size_f,
                                float* t3, float* t2, float* v2,
                                int cond_kernel_1, int opt_register_transpose);

//
// t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
//
int main(int argc, char** argv)
{
    // for sd2
    float *host_C, *host_C_chk;
    float *host_A;
    float *host_B;

    int size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f;

    // Problem Size
    size_idx_a = 16;
    size_idx_b = 16;
    size_idx_c = 16;
    size_idx_d = 16;
    size_idx_e = 16;
    size_idx_f = 16;

    //
    if (argc == 7)
    {
        size_idx_a = atoi(argv[1]);
        size_idx_b = atoi(argv[2]);
        size_idx_c = atoi(argv[3]);
        size_idx_d = atoi(argv[4]);
        size_idx_e = atoi(argv[5]);
        size_idx_f = atoi(argv[6]);
    }

    int size_C;
    int size_A;
    int size_B;
    int size_internal;

    // t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
    size_internal = size_idx_e * size_idx_f;
    size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;
    size_A = size_idx_a * size_idx_e * size_idx_b * size_idx_f;
    size_B = size_idx_d * size_idx_f * size_idx_c * size_idx_e;

    //
    host_C     = (float*)malloc(sizeof(float) * size_C);
    host_C_chk = (float*)malloc(sizeof(float) * size_C);
    host_A     = (float*)malloc(sizeof(float) * size_A);
    host_B     = (float*)malloc(sizeof(float) * size_B);

    printf("==========================================================================================================\n");
    printf(">>> abcd-aebf-dfce\n");
    printf(">>> t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];\n");
    printf(">>> Problem Size (a,b,c,d) and (e,f): (%2d,%2d,%2d,%2d) and (%2d,%2d)\n",
           size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f);
    printf("==========================================================================================================\n");

    // Initialize "1" Output and "2 x 9" Inputs
    pre_Initializing_Input_Tensors(host_C, host_C_chk, size_C, host_A, size_A, host_B, size_B);

    // Run the Kernels
    sd_t_d2_fusion_(size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f,
                    host_C, host_A, host_B, 1, -1);

#ifdef DEBUG_CORRECTNESS
    // Correctness-Check
    post_Correctness(host_C, host_C_chk, host_A, host_B,
                     size_idx_a, size_idx_b, size_idx_c, size_idx_d, size_idx_e, size_idx_f);
#endif

    // Free
    free(host_C);
    free(host_C_chk);
    free(host_A);
    free(host_B);

    return 0;
}

// Initialize t3 (t3_temp), 9 t2 and 9 v2.
void pre_Initializing_Input_Tensors(float* h_C, float* h_C_chk, int size_C,
                                    float* h_A, int size_A, float* h_B, int size_B)
{
    // t3
    int i, j;
    for (i = 0; i < size_C; i++)
    {
        h_C[i] = 0.0;
        h_C_chk[i] = 0.0;
    }

    for (j = 0; j < size_A; j++)
    {
        h_A[j] = ((float)rand() / RAND_MAX);
    }

    for (j = 0; j < size_B; j++)
    {
        h_B[j] = ((float)rand() / RAND_MAX);
    }
}

//
void post_Correctness(float* h_C, float* h_C_chk, float* h_A, float* h_B,
                      int size_idx_a, int size_idx_b, int size_idx_c,
                      int size_idx_d, int size_idx_e, int size_idx_f)
{
    // t3 [a,16,b,16,c,16,d,16] += sum(e,16,f,16) * t2 [a,e,b,f] * v2 [d,f,c,e];
    int size_C = size_idx_a * size_idx_b * size_idx_c * size_idx_d;

    long long int tmp_ops = 0;
    int ops = 0;
    int idx_a, idx_b, idx_c, idx_d, idx_e, idx_f;

    for (idx_a = 0; idx_a < size_idx_a; idx_a++)
    for (idx_b = 0; idx_b < size_idx_b; idx_b++)
    for (idx_c = 0; idx_c < size_idx_c; idx_c++)
    for (idx_d = 0; idx_d < size_idx_d; idx_d++)
    {
        int tmp_r_idx = idx_a + (idx_b + (idx_c + (idx_d) * size_idx_c) * size_idx_b) * size_idx_a;
#ifdef DEBUG_SIMPLE_CORRECTNESS
        if (tmp_r_idx > 1024) break;
#endif
        for (idx_e = 0; idx_e < size_idx_e; idx_e++, ops = 0)
        {
            for (idx_f = 0; idx_f < size_idx_f; idx_f++)
            {
                h_C_chk[tmp_r_idx] += h_A[idx_a + (idx_e + (idx_b + (idx_f) * size_idx_b) * size_idx_e) * size_idx_a]
                                    * h_B[idx_d + (idx_f + (idx_c + (idx_e) * size_idx_c) * size_idx_f) * size_idx_d];
                ops++;
            }
            tmp_ops = tmp_ops + ops;
        }
    }

    printf("======================================= Correctness Check ==========================================\n");
    float epsilon = 0.00000001;
    int diff = 0;
    int same = 0;
    int i;
    for (i = 0; i < size_C; i++)
    {
        float check = h_C_chk[i] - h_C[i];
        if (check < 0) check *= -1;

        if (check > epsilon)
        {
            diff++;
            if (diff < 8)
                printf("Index: %5d, (Host) %8.4f, (Dev.) %8.4f >> (Diff.) %8.4f\n",
                       i, h_C_chk[i], h_C[i], check);
        }
        else
        {
            same++;
        }
    }
    printf(" >>> PASSED: %'10d among %'10d in t3\n", same, size_C);
    printf(" >>> ERROR : %'10d among %'10d in t3\n", diff, size_C);
    printf(" >>> Total Operations: %'lld\n", tmp_ops * 2);
    printf("====================================================================================================\n");
}
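
Editor's note: post_Correctness() is compiled out unless DEBUG_CORRECTNESS is defined (and the reference check can be shortened further with DEBUG_SIMPLE_CORRECTNESS). Rather than uncommenting the #define, the macro can be supplied on the compile line; assuming the same flags as the Makefile:

    hipcc -O3 --offload-arch=gfx90a -DDEBUG_CORRECTNESS main.cpp 4D_kernel.hpp -o 4D_kernel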