gaoqiong / composable_kernel / Commits

Commit 4227fce6, authored Dec 01, 2021 by Chao Liu

Merge remote-tracking branch 'origin/develop' into gemm_activation

Parents: c24b6de6, 4041850f
Showing 7 changed files with 285 additions and 35 deletions (+285, -35):

- CMakeLists.txt (+1, -0)
- host/driver_offline/src/conv_fwd_driver_offline.cpp (+3, -3)
- profiler/conv_profiler.cpp (+8, -8)
- profiler/gemm_profiler.cpp (+89, -22)
- script/profile_gemm.sh (+23, -2)
- test/CMakeLists.txt (+18, -0)
- test/magic_number_division/main.cpp (+143, -0)
CMakeLists.txt

@@ -200,3 +200,4 @@ enable_cppcheck(
 add_subdirectory(host)
 add_subdirectory(example)
 add_subdirectory(profiler)
+add_subdirectory(test)
host/driver_offline/src/conv_fwd_driver_offline.cpp

@@ -20,10 +20,10 @@
 #define USE_DYNAMIC_MODE 0
 #define USE_CONV_FWD_V4R4_NCHW 0
-#define USE_CONV_FWD_V4R4R2_NHWC 1
-#define USE_CONV_FWD_V6R1_NCHW 1
+#define USE_CONV_FWD_V4R4R2_NHWC 0
+#define USE_CONV_FWD_V6R1_NCHW 0
 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
-#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
+#define USE_CONV_FWD_V4R4R4_XDL_NHWC 1

 enum ConvTensorLayout
 {
profiler/conv_profiler.cpp

@@ -34,14 +34,14 @@ int conv_profiler(int argc, char* argv[])
 {
     if(argc != 25)
     {
-        printf("arg1: tensor operation (conv=Convolution)\n");
-        printf("arg2: data type (0=fp32, 1=fp16)\n");
-        printf("arg3: input tensor layout (0=NCHW, 1=NHWC)\n");
-        printf("arg4: weight tensor layout (0=KCYX, 1=KYXC)\n");
-        printf("arg5: output tensor layout (0=NKHW, 1=NHWK)\n");
-        printf("arg6: verification (0=no, 1=yes)\n");
-        printf("arg7: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg8: print matrix value (0=no, 1=yes)\n");
+        printf("arg1: tensor operation (conv: Convolution)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
         printf("arg9: run kernel # of times (>1)\n");
         printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
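Under the reworded help text a complete run passes 24 arguments after the program name; for example (the binary path and the shape values are illustrative, not taken from the commit):

    ./profiler conv 0 1 1 1 1 1 0 10 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1

would profile an fp32 NHWC/KYXC/NHWK forward convolution with verification and integer initialization, timing the kernel over 10 runs.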
profiler/gemm_profiler.cpp

@@ -37,12 +37,15 @@ int gemm_profiler(int argc, char* argv[])
 {
     if(argc != 14)
     {
-        printf("arg1: tensor operation (gemm=GEMM)\n");
-        printf("arg2: data type (0=fp32, 1=fp16)\n");
-        printf("arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT)\n");
-        printf("arg4: verification (0=no, 1=yes)\n");
-        printf("arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg6: print matrix value (0=no, 1=yes)\n");
+        printf("arg1: tensor operation (gemm: GEMM)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
         printf("arg7: run kernel # of times (>1)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
         exit(1);
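A matching invocation under the new scheme, exercising the -1 stride defaults introduced in the hunks below (binary path illustrative):

    ./profiler gemm 1 0 1 1 0 10 3840 4096 4096 -1 -1 -1

i.e. fp16, layout 0 (A[m, k] * B[k, n] = C[m, n]), verification on, integer initialization, no printing, 10 timed runs, with all three strides left to their packed defaults.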
@@ -70,8 +73,16 @@ int gemm_profiler(int argc, char* argv[])
                                   ck::half_t,
                                   ck::tensor_layout::gemm::RowMajor,
                                   ck::tensor_layout::gemm::RowMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? K : StrideA,
+                                                                     (StrideB < 0) ? N : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
@@ -80,8 +91,16 @@ int gemm_profiler(int argc, char* argv[])
                                   ck::half_t,
                                   ck::tensor_layout::gemm::RowMajor,
                                   ck::tensor_layout::gemm::ColumnMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? K : StrideA,
+                                                                     (StrideB < 0) ? K : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
     {
@@ -90,8 +109,16 @@ int gemm_profiler(int argc, char* argv[])
                                   ck::half_t,
                                   ck::tensor_layout::gemm::ColumnMajor,
                                   ck::tensor_layout::gemm::RowMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? M : StrideA,
+                                                                     (StrideB < 0) ? N : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
     {
@@ -100,8 +127,16 @@ int gemm_profiler(int argc, char* argv[])
                                   ck::half_t,
                                   ck::tensor_layout::gemm::ColumnMajor,
                                   ck::tensor_layout::gemm::ColumnMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? M : StrideA,
+                                                                     (StrideB < 0) ? K : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
     {
@@ -110,8 +145,16 @@ int gemm_profiler(int argc, char* argv[])
                                   float,
                                   ck::tensor_layout::gemm::RowMajor,
                                   ck::tensor_layout::gemm::RowMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? K : StrideA,
+                                                                     (StrideB < 0) ? N : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
     {
@@ -120,8 +163,16 @@ int gemm_profiler(int argc, char* argv[])
                                   float,
                                   ck::tensor_layout::gemm::RowMajor,
                                   ck::tensor_layout::gemm::ColumnMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? K : StrideA,
+                                                                     (StrideB < 0) ? K : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
     {
@@ -130,8 +181,16 @@ int gemm_profiler(int argc, char* argv[])
                                   float,
                                   ck::tensor_layout::gemm::ColumnMajor,
                                   ck::tensor_layout::gemm::RowMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? M : StrideA,
+                                                                     (StrideB < 0) ? N : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
     {
@@ -140,8 +199,16 @@ int gemm_profiler(int argc, char* argv[])
                                   float,
                                   ck::tensor_layout::gemm::ColumnMajor,
                                   ck::tensor_layout::gemm::ColumnMajor,
-                                  ck::tensor_layout::gemm::RowMajor>(
-            do_verification, init_method, do_log, nrepeat, M, N, K, StrideA, StrideB, StrideC);
+                                  ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                     init_method,
+                                                                     do_log,
+                                                                     nrepeat,
+                                                                     M,
+                                                                     N,
+                                                                     K,
+                                                                     (StrideA < 0) ? M : StrideA,
+                                                                     (StrideB < 0) ? K : StrideB,
+                                                                     (StrideC < 0) ? N : StrideC);
     }
     else
     {
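All eight branches apply the same rule: a negative stride is replaced by the packed default for that operand's layout. For A (M x K) the default is K row-major and M column-major; for B (K x N) it is N row-major and K column-major; C (M x N) is always row-major here, so it defaults to N. A minimal C++ sketch of the rule (the helper names below are hypothetical, not part of the commit):

    #include <cstdint>

    // Packed leading stride of an R x C matrix: consecutive row-major rows
    // are C elements apart; consecutive column-major columns are R apart.
    inline int64_t packed_stride(int64_t R, int64_t C, bool row_major)
    {
        return row_major ? C : R;
    }

    // The profiler's convention: a negative user stride selects the packed default.
    inline int64_t stride_or_default(int64_t user_stride, int64_t packed)
    {
        return (user_stride < 0) ? packed : user_stride;
    }

For example, with M = 960, N = 1024, K = 1024 and row-major A, B, and C, strides of -1 resolve to StrideA = 1024, StrideB = 1024, StrideC = 1024.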
script/profile_gemm.sh

@@ -18,7 +18,28 @@ REPEAT=$7
 ######## op  datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
 #$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  256  256  256  256  256  256
 #$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  960 1024 1024 1024 1024 1024
 #$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
 #$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048 2048 2048 2048
-$DRIVER  $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
+#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096 4096 4096 4096
+#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192 8192 8192 8192
+#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
+#$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
+
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT  960 1024 1024   -1   -1   -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1920 2048 2048   -1   -1   -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 3840 4096 4096   -1   -1   -1
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 7680 8192 8192   -1   -1   -1
+
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1024 1024 1024
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2048 2048 2048
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4096 4096 4096
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8192 8192 8192
+
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1056 1056 1056
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2080 2080 2080
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4128 4128 4128
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8224 8224 8224
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 1024 1024 1024 1088 1088 1088
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 2048 2048 2048 2112 2112 2112
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 4096 4096 4096 4160 4160 4160
+$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $REPEAT 8192 8192 8192 8256 8256 8256
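The first group of active runs exercises the new -1 stride defaults; the next group repeats power-of-two shapes with explicit packed strides, and the last two groups pad the strides to extent + 32 and extent + 64, presumably to probe the cost of non-packed leading dimensions. Assuming DRIVER is set earlier in the script and the seven positional parameters map to op, datatype, layout, verify, init, log, and repeat, the sweep can be launched as, e.g., bash script/profile_gemm.sh gemm 1 0 0 1 0 10.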
test/CMakeLists.txt (new file, mode 100644)

include_directories(BEFORE
    include
    ${PROJECT_SOURCE_DIR}/host/host_tensor/include
    ${PROJECT_SOURCE_DIR}/host/device/include
    ${PROJECT_SOURCE_DIR}/device_operation/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
    ${PROJECT_SOURCE_DIR}/external/rocm/include
)

set(MAGIC_NUMBER_DIVISION_SOURCE magic_number_division/main.cpp)

add_executable(test_magic_number_division ${MAGIC_NUMBER_DIVISION_SOURCE})
target_link_libraries(test_magic_number_division PRIVATE host_tensor)
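With this file in place the test is a regular CMake target; assuming an already-configured build tree under ./build, something like cmake --build build --target test_magic_number_division should produce the binary under build/test/.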
test/magic_number_division/main.cpp (new file, mode 100644)

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"

// Each thread divides a strided range of dividends using the magic-number method.
__global__ void gpu_magic_number_division(uint32_t magic_multiplier,
                                          uint32_t magic_shift,
                                          const int32_t* p_dividend,
                                          int32_t* p_result,
                                          uint64_t num)
{
    uint64_t global_thread_num = blockDim.x * gridDim.x;
    uint64_t global_thread_id  = blockIdx.x * blockDim.x + threadIdx.x;

    for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
    {
        p_result[data_id] = ck::MagicDivision::DoMagicDivision(
            p_dividend[data_id], magic_multiplier, magic_shift);
    }
}

// Reference kernel: the same division done with the hardware "/" operator.
__global__ void
gpu_naive_division(int32_t divisor, const int32_t* p_dividend, int32_t* p_result, uint64_t num)
{
    uint64_t global_thread_num = blockDim.x * gridDim.x;
    uint64_t global_thread_id  = blockIdx.x * blockDim.x + threadIdx.x;

    for(uint64_t data_id = global_thread_id; data_id < num; data_id += global_thread_num)
    {
        p_result[data_id] = p_dividend[data_id] / divisor;
    }
}

template <typename T>
T check_error(const std::vector<T>& ref, const std::vector<T>& result)
{
    T error    = 0;
    T max_diff = 0;
    T ref_value = 0, result_value = 0;

    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        T diff = std::abs(ref[i] - result[i]);

        error += diff;

        if(max_diff < diff)
        {
            max_diff     = diff;
            ref_value    = ref[i];
            result_value = result[i];
        }
    }

    return max_diff;
}

int main(int, char* [])
{
    uint64_t num_divisor  = 4096;
    uint64_t num_dividend = 1L << 16;

    std::vector<int32_t> divisors_host(num_divisor);
    std::vector<int32_t> dividends_host(num_dividend);

    // generate divisors 1..num_divisor
    for(uint64_t i = 0; i < num_divisor; ++i)
    {
        divisors_host[i] = i + 1;
    }

    // generate dividends 0..num_dividend-1
    for(uint64_t i = 0; i < num_dividend; ++i)
    {
        dividends_host[i] = i;
    }

    DeviceMem dividends_dev_buf(sizeof(int32_t) * num_dividend);
    DeviceMem naive_result_dev_buf(sizeof(int32_t) * num_dividend);
    DeviceMem magic_result_dev_buf(sizeof(int32_t) * num_dividend);

    std::vector<int32_t> naive_result_host(num_dividend);
    std::vector<int32_t> magic_result_host(num_dividend);

    dividends_dev_buf.ToDevice(dividends_host.data());

    bool pass = true;

    for(std::size_t i = 0; i < num_divisor; ++i)
    {
        // run naive division on GPU
        gpu_naive_division<<<1024, 256>>>(
            divisors_host[i],
            static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
            static_cast<int32_t*>(naive_result_dev_buf.GetDeviceBuffer()),
            num_dividend);

        // calculate magic numbers for this divisor
        uint32_t magic_multiplier, magic_shift;
        ck::tie(magic_multiplier, magic_shift) =
            ck::MagicDivision::CalculateMagicNumbers(divisors_host[i]);

        // run magic-number division on GPU
        gpu_magic_number_division<<<1024, 256>>>(
            magic_multiplier,
            magic_shift,
            static_cast<const int32_t*>(dividends_dev_buf.GetDeviceBuffer()),
            static_cast<int32_t*>(magic_result_dev_buf.GetDeviceBuffer()),
            num_dividend);

        naive_result_dev_buf.FromDevice(naive_result_host.data());
        magic_result_dev_buf.FromDevice(magic_result_host.data());

        int32_t max_diff = check_error(naive_result_host, magic_result_host);

        if(max_diff != 0)
        {
            pass = false;
            continue;
        }
    }

    if(pass)
    {
        std::cout << "test magic number division: Pass" << std::endl;
    }
    else
    {
        std::cout << "test magic number division: Fail" << std::endl;
    }

    return pass ? 0 : 1;
}
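For context on what the test verifies: magic-number division replaces n / d for a runtime-constant divisor d with a multiply-high plus cheap add/shift operations, which is far cheaper than integer division on GPUs. A classic construction picks the smallest s with 2^s >= d and m = floor(2^32 * (2^s - d) / d) + 1; then n / d == (mulhi(n, m) + n) >> s for all n < 2^31. The sketch below is a self-contained host-side illustration of that scheme; the struct and function names are hypothetical, and the actual ck::MagicDivision implementation may differ in details.

    #include <cassert>
    #include <cstdint>

    struct MagicNumbers
    {
        uint32_t multiplier;
        uint32_t shift;
    };

    // Valid for divisor in [1, 2^31].
    MagicNumbers calculate_magic_numbers(uint32_t divisor)
    {
        uint32_t shift = 0;
        while((uint64_t{1} << shift) < divisor)
            ++shift;

        uint64_t multiplier =
            ((uint64_t{1} << 32) * ((uint64_t{1} << shift) - divisor)) / divisor + 1;

        return {static_cast<uint32_t>(multiplier), shift};
    }

    // Valid for dividends n < 2^31.
    uint32_t magic_divide(uint32_t n, MagicNumbers mn)
    {
        uint32_t hi = static_cast<uint32_t>((uint64_t{n} * mn.multiplier) >> 32); // mulhi
        return static_cast<uint32_t>((uint64_t{hi} + n) >> mn.shift);
    }

    int main()
    {
        // mirror the GPU test's ranges: divisors 1..4096, dividends 0..2^16-1
        for(uint32_t d = 1; d <= 4096; ++d)
        {
            MagicNumbers mn = calculate_magic_numbers(d);
            for(uint32_t n = 0; n < (1u << 16); ++n)
                assert(n / d == magic_divide(n, mn));
        }
        return 0;
    }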