Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b134b7d6
Commit
b134b7d6
authored
May 16, 2022
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into cpu_avx2
parents
090ba885
9f71ff48
Changes
211
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
417 additions
and
379 deletions
+417
-379
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
+57
-56
example/10_conv2d_bwd_data/CMakeLists.txt
example/10_conv2d_bwd_data/CMakeLists.txt
+1
-1
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+12
-8
example/11_conv2d_bwd_weight/CMakeLists.txt
example/11_conv2d_bwd_weight/CMakeLists.txt
+1
-1
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
+11
-8
example/12_reduce/CMakeLists.txt
example/12_reduce/CMakeLists.txt
+1
-1
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+11
-11
example/13_pool2d_fwd/pool2d_fwd.cpp
example/13_pool2d_fwd/pool2d_fwd.cpp
+14
-12
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
...quant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
+75
-58
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
+12
-11
example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
+54
-53
example/17_convnd_bwd_data_xdl/CMakeLists.txt
example/17_convnd_bwd_data_xdl/CMakeLists.txt
+1
-1
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
+59
-55
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
...e/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
+58
-60
example/CMakeLists.txt
example/CMakeLists.txt
+11
-2
include/ck/config.hpp
include/ck/config.hpp
+5
-4
include/ck/hip_version.hpp.in
include/ck/hip_version.hpp.in
+0
-28
include/ck/options.hpp.in
include/ck/options.hpp.in
+3
-0
include/ck/stream_config.hpp
include/ck/stream_config.hpp
+10
-0
include/ck/tensor_description/tensor_descriptor_helper.hpp
include/ck/tensor_description/tensor_descriptor_helper.hpp
+21
-9
No files found.
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp
View file @
b134b7d6
...
...
@@ -5,7 +5,7 @@
#include "check_err.hpp"
#include "config.hpp"
#include "conv_
fwd_
util.hpp"
#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
...
...
@@ -45,10 +45,10 @@ template <ck::index_t NumDimSpatial>
using
DeviceConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
<
// clang-format off
InDataType
,
//
InDataType
,
//
WeiDataType
,
//
OutDataType
,
//
AccDataType
,
//
AccDataType
,
//
InElementOp
,
// Input Elementwise Operation
WeiElementOp
,
// Weights Elementwise Operation
OutElementOp
,
// Output Elementwise Operation
...
...
@@ -112,7 +112,7 @@ void print_use_msg()
{
std
::
cout
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
<<
"arg3:
run
kernel
# of times (>1
)
\n
"
<<
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
<<
"arg4: N spatial dimensions (default 2)
\n
"
<<
"Following arguments (depending on number of spatial dims):
\n
"
<<
" N, K, C,
\n
"
...
...
@@ -139,40 +139,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
ck
::
utils
::
conv
::
ConvParams
params
;
int
arg_idx
=
5
;
params
.
num_dim_spatial
=
num_dim_spatial
;
params
.
N
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
num_dim_spatial
_
=
num_dim_spatial
;
params
.
N
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
.
resize
(
num_dim_spatial
);
params
.
filter_spatial_lengths
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
filter_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_spatial_lengths
.
resize
(
num_dim_spatial
);
params
.
input_spatial_lengths
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_spatial_lengths
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_strides
.
resize
(
num_dim_spatial
);
params
.
conv_filter_strides
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_strides
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
conv_filter_strides
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_dilations
.
resize
(
num_dim_spatial
);
params
.
conv_filter_dilations
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_dilations
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
conv_filter_dilations
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_left_pads
.
resize
(
num_dim_spatial
);
params
.
input_left_pads
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_left_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_left_pads
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_right_pads
.
resize
(
num_dim_spatial
);
params
.
input_right_pads
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_right_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_right_pads
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
return
params
;
...
...
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
{
using
namespace
ck
::
utils
::
conv
;
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
int
num_dim_spatial
=
2
;
ck
::
utils
::
conv
::
ConvParams
params
;
...
...
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
}
...
...
@@ -204,21 +204,21 @@ int main(int argc, char* argv[])
params
=
parse_conv_params
(
num_dim_spatial
,
argc
,
argv
);
}
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
),
static_cast
<
std
::
size_t
>
(
params
.
C
)};
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
_
),
static_cast
<
std
::
size_t
>
(
params
.
C
_
)};
input_dims
.
insert
(
std
::
end
(
input_dims
),
std
::
begin
(
params
.
input_spatial_lengths
),
std
::
end
(
params
.
input_spatial_lengths
));
std
::
begin
(
params
.
input_spatial_lengths
_
),
std
::
end
(
params
.
input_spatial_lengths
_
));
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params
.
K
),
static_cast
<
std
::
size_t
>
(
params
.
C
)};
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params
.
K
_
),
static_cast
<
std
::
size_t
>
(
params
.
C
_
)};
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
std
::
begin
(
params
.
filter_spatial_lengths
),
std
::
end
(
params
.
filter_spatial_lengths
));
std
::
begin
(
params
.
filter_spatial_lengths
_
),
std
::
end
(
params
.
filter_spatial_lengths
_
));
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
),
static_cast
<
std
::
size_t
>
(
params
.
K
)};
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
_
),
static_cast
<
std
::
size_t
>
(
params
.
K
_
)};
output_dims
.
insert
(
std
::
end
(
output_dims
),
std
::
begin
(
output_spatial_lengths
),
std
::
end
(
output_spatial_lengths
));
...
...
@@ -258,16 +258,16 @@ int main(int argc, char* argv[])
conv
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
params
.
N
_
,
params
.
K
_
,
params
.
C
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
...
...
@@ -279,16 +279,16 @@ int main(int argc, char* argv[])
"not support this Conv problem"
);
}
float
ave_time
=
invoker
->
Run
(
argument
.
get
(),
nrepeat
);
float
ave_time
=
invoker
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flop
=
get_flops
(
params
.
N
,
params
.
C
,
params
.
K
,
params
.
filter_spatial_lengths
,
output_spatial_lengths
);
params
.
N
_
,
params
.
C
_
,
params
.
K
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
);
std
::
size_t
num_btype
=
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
params
.
N
,
params
.
C
,
params
.
K
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
params
.
N
_
,
params
.
C
_
,
params
.
K
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
...
@@ -304,18 +304,18 @@ int main(int argc, char* argv[])
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
weights
,
host_output
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
ref_invoker
.
Run
(
ref_argument
);
out_device_buf
.
FromDevice
(
device_output
.
mData
.
data
());
ck
::
utils
::
check_err
(
host_output
.
mData
,
device_output
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
return
ck
::
utils
::
check_err
(
host_output
.
mData
,
device_output
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
)
?
0
:
1
;
};
switch
(
num_dim_spatial
)
...
...
@@ -340,4 +340,5 @@ int main(int argc, char* argv[])
}
}
}
return
0
;
}
example/10_conv2d_bwd_data/CMakeLists.txt
View file @
b134b7d6
add_example_executable
(
example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp
)
target_link_libraries
(
example_conv2d_bwd_data_xdl PRIVATE conv_
fwd_
util
)
target_link_libraries
(
example_conv2d_bwd_data_xdl PRIVATE conv_util
)
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
View file @
b134b7d6
...
...
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
// Conv shape
ck
::
index_t
N
=
128
;
...
...
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
19
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
N
=
std
::
stoi
(
argv
[
4
]);
K
=
std
::
stoi
(
argv
[
5
]);
...
...
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
0
);
...
...
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
"not support this Conv problem"
);
}
float
ave_time
=
invoker
.
Run
(
argument
,
nrepeat
);
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
...
...
@@ -249,6 +249,10 @@ int main(int argc, char* argv[])
in_device_buf
.
FromDevice
(
in_n_c_hi_wi_device_result
.
mData
.
data
());
ck
::
utils
::
check_err
(
in_n_c_hi_wi_device_result
.
mData
,
in_n_c_hi_wi_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
in_n_c_hi_wi_device_result
.
mData
,
in_n_c_hi_wi_host_result
.
mData
)
?
0
:
1
;
}
return
0
;
}
example/11_conv2d_bwd_weight/CMakeLists.txt
View file @
b134b7d6
add_example_executable
(
example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp
)
target_link_libraries
(
example_conv2d_bwd_weight_xdl PRIVATE conv_
fwd_
util
)
target_link_libraries
(
example_conv2d_bwd_weight_xdl PRIVATE conv_util
)
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp
View file @
b134b7d6
...
...
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
int
do_log
=
0
;
int
split_k
=
4
;
...
...
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
do_log
=
std
::
stoi
(
argv
[
4
]);
split_k
=
std
::
stoi
(
argv
[
5
]);
}
...
...
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
do_log
=
std
::
stoi
(
argv
[
4
]);
split_k
=
std
::
stoi
(
argv
[
5
]);
...
...
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg4: is show log (0=no, 1=yes)
\n
"
);
printf
(
"arg5: split-k
\n
"
);
printf
(
"arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
...
...
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
return
1
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
nrepeat
);
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
;
...
...
@@ -291,6 +291,9 @@ int main(int argc, char* argv[])
LogRangeAsType
<
float
>
(
std
::
cout
<<
"wei_host : "
,
wei_k_c_y_x_host_result
.
mData
,
","
)
<<
std
::
endl
;
}
ck
::
utils
::
check_err
(
wei_k_c_y_x_device_result
.
mData
,
wei_k_c_y_x_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
wei_k_c_y_x_device_result
.
mData
,
wei_k_c_y_x_host_result
.
mData
)
?
0
:
1
;
}
return
0
;
}
example/12_reduce/CMakeLists.txt
View file @
b134b7d6
add_example_executable
(
example_reduce_blockwise reduce_blockwise.cpp
)
add_example_executable
(
example_reduce_blockwise reduce_blockwise.cpp
-D 16,64,32,960 -v 1 1 10
)
example/12_reduce/reduce_blockwise.cpp
View file @
b134b7d6
...
...
@@ -116,10 +116,9 @@ class SimpleAppArgs
std
::
vector
<
size_t
>
inLengths
;
std
::
vector
<
float
>
scales
;
bool
do_verification
=
false
;
int
init_method
=
1
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
public:
void
show_usage
(
const
char
*
cmd
)
...
...
@@ -135,12 +134,12 @@ class SimpleAppArgs
std
::
cout
<<
"Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<<
std
::
endl
;
std
::
cout
<<
"Arg2 --
number of repeats to run the kernel
"
<<
std
::
endl
;
std
::
cout
<<
"Arg2 --
time kernel (0=n0, 1=yes)
"
<<
std
::
endl
;
};
int
processArgs
(
int
argc
,
char
*
argv
[])
{
unsigned
int
ch
;
int
ch
;
while
(
1
)
{
...
...
@@ -182,7 +181,7 @@ class SimpleAppArgs
throw
std
::
runtime_error
(
"Invalid cmd-line arguments, more argumetns are needed!"
);
init_method
=
std
::
atoi
(
argv
[
optind
++
]);
nrepeat
=
std
::
atoi
(
argv
[
optind
]);
time_kernel
=
std
::
atoi
(
argv
[
optind
]);
if
(
scales
.
empty
())
{
...
...
@@ -352,7 +351,7 @@ int main(int argc, char* argv[])
auto
invoker_ptr
=
reduce
.
MakeInvokerPointer
();
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
args
.
nrepeat
);
float
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
args
.
time_kernel
}
);
std
::
size_t
num_bytes
=
invariant_total_length
*
reduce_total_length
*
sizeof
(
InDataType
)
+
invariant_total_length
*
sizeof
(
OutDataType
);
...
...
@@ -362,16 +361,17 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s, "
<<
reduce_name
<<
std
::
endl
;
bool
pass
=
true
;
if
(
args
.
do_verification
)
{
out_dev
.
FromDevice
(
out
.
mData
.
data
());
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
pass
&=
ck
::
utils
::
check_err
(
out
.
mData
,
out_ref
.
mData
);
if
(
NeedIndices
)
{
out_indices_dev
.
FromDevice
(
out_indices
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
;
pass
&=
ck
::
utils
::
check_err
(
out_indices
.
mData
,
out_indices_ref
.
mData
);
};
};
return
pass
?
0
:
1
;
}
example/13_pool2d_fwd/pool2d_fwd.cpp
View file @
b134b7d6
...
...
@@ -80,8 +80,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for
(
int
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
int
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
if
(
hi
>=
0
&&
hi
<
ck
::
type_convert
<
int
>
(
in
.
mDesc
.
GetLengths
()[
2
]
)
&&
wi
>=
0
&&
wi
<
ck
::
type_convert
<
int
>
(
in
.
mDesc
.
GetLengths
()[
3
])
)
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
...
...
@@ -149,9 +149,9 @@ int main(int argc, char* argv[])
{
using
namespace
ck
::
host_reduce
;
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
// Pool shape
ck
::
index_t
N
=
128
;
...
...
@@ -171,13 +171,13 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
16
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
N
=
std
::
stoi
(
argv
[
4
]);
C
=
std
::
stoi
(
argv
[
5
]);
...
...
@@ -196,7 +196,7 @@ int main(int argc, char* argv[])
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
0
);
...
...
@@ -271,7 +271,7 @@ int main(int argc, char* argv[])
"not support this problem"
);
}
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
N
*
C
*
Ho
*
Wo
*
Y
*
X
;
...
...
@@ -285,6 +285,7 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
pool_host_verify
<
InDataType
,
...
...
@@ -302,14 +303,15 @@ int main(int argc, char* argv[])
out_device_buf
.
FromDevice
(
out_n_c_ho_wo_device
.
mData
.
data
());
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
.
mData
,
out_n_c_ho_wo_host
.
mData
);
pass
&=
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
.
mData
,
out_n_c_ho_wo_host
.
mData
);
if
constexpr
(
NeedIndices
)
{
out_indices_device_buf
.
FromDevice
(
out_indices_n_c_ho_wo_device
.
mData
.
data
());
//
ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
//
out_indices_n_c_ho_wo_host.mData);
;
pass
&=
ck
::
utils
::
check_err
(
out_indices_n_c_ho_wo_device
.
mData
,
out_indices_n_c_ho_wo_host
.
mData
);
};
}
return
pass
?
0
:
1
;
}
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
View file @
b134b7d6
...
...
@@ -13,74 +13,91 @@
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
#include "device_gemm_xdl.hpp"
#include "device_gemm_xdl_c_shuffle.hpp"
#include "device_gemm_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
struct
RequantReluRequant
{
// FIXME: We just need one scale for Relu / Leaky Relu / PRelu
RequantReluRequant
(
float
scaleGemm
,
float
scaleRelu
)
:
scaleGemm_
(
scaleGemm
),
scaleRelu_
(
scaleRelu
)
{
}
using
F32
=
float
;
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
float
gemm_requant
=
scaleGemm_
*
x
;
float
relu
=
gemm_requant
>
0
?
gemm_requant
:
0
;
float
relu_requant
=
scaleRelu_
*
relu
;
y
=
relu_requant
>
127
?
127
:
relu_requant
<
-
128
?
-
128
:
relu_requant
;
}
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
float
scaleGemm_
;
float
scaleRelu_
;
};
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
RequantReluRequant
=
ck
::
tensor_operation
::
element_wise
::
RequantReluRequant
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
int8_t
;
using
BDataType
=
int8_t
;
using
CDataType
=
int8_t
;
using
AccDataType
=
int32_t
;
using
CShuffleDataType
=
int32_
t
;
using
CShuffleDataType
=
floa
t
;
using
ALayout
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
BLayout
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
CLayout
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
static
constexpr
auto
GemmDefault
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// clang-format off
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemmXdl_C_Shuffle
<
ADataType
,
// ADataType
BDataType
,
// BDataType
CDataType
,
// CDataType
AccDataType
,
// AccDataType
CShuffleDataType
,
// CShuffleDataType
ALayout
,
// ALayout
BLayout
,
// BLayout
CLayout
,
// CLayout
PassThrough
,
// AElementwiseOperation
PassThrough
,
// BElementwiseOperation
RequantReluRequant
,
// CElementwiseOperation
256
,
// BlockSize
256
,
// MPerBlock
128
,
// NPerBlock
64
,
// KPerBlock
16
,
// AK1
16
,
// BK1
32
,
// MPerXDL
32
,
// NPerXDL
4
,
// MXdlPerWave
2
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_K0_M_K1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
16
,
// ABlockTransferSrcScalarPerVector
16
,
// ABlockTransferDstScalarPerVector_K1
true
,
// ABlockLdsAddExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_K0_N_K1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
16
,
// BBlockTransferSrcScalarPerVector
16
,
// BBlockTransferDstScalarPerVector_K1
true
,
// BBlockLdsAddExtraN
1
,
// CShuffleMXdlPerWavePerShuffle
1
,
// CShuffleNXdlPerWavePerShuffle
S
<
1
,
1
,
64
,
1
,
1
,
4
>
,
// CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
16
>
;
// CBlockTransferScalarPerVector_NWaveNPerXdl
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemm_Xdl_CShuffle
<
ALayout
,
// typename ALayout,
BLayout
,
// typename BLayout,
CLayout
,
// typename CLayout,
ADataType
,
// typename ADataType,
BDataType
,
// typename BDataType,
CDataType
,
// typename CDataType,
AccDataType
,
// typename GemmAccDataType,
CShuffleDataType
,
// typename CShuffleDataType,
PassThrough
,
// typename AElementwiseOperation,
PassThrough
,
// typename BElementwiseOperation,
RequantReluRequant
,
// typename CElementwiseOperation,
GemmDefault
,
// GemmSpecialization GemmSpec,
1
,
// index_t NumGemmKPrefetchStage,
256
,
// index_t BlockSize,
256
,
// index_t MPerBlock,
128
,
// index_t NPerBlock,
64
,
// index_t KPerBlock,
16
,
// index_t AK1,
16
,
// index_t BK1,
32
,
// index_t MPerXDL,
32
,
// index_t NPerXDL,
4
,
// index_t MXdlPerWave,
2
,
// index_t NXdlPerWave,
S
<
4
,
64
,
1
>
,
// typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
S
<
1
,
0
,
2
>
,
// typename ABlockTransferThreadClusterArrangeOrder,
S
<
1
,
0
,
2
>
,
// typename ABlockTransferSrcAccessOrder,
2
,
// index_t ABlockTransferSrcVectorDim,
16
,
// index_t ABlockTransferSrcScalarPerVector,
16
,
// index_t ABlockTransferDstScalarPerVector_AK1,
1
,
// bool ABlockLdsExtraM,
S
<
4
,
64
,
1
>
,
// typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
S
<
1
,
0
,
2
>
,
// typename BBlockTransferThreadClusterArrangeOrder,
S
<
1
,
0
,
2
>
,
// typename BBlockTransferSrcAccessOrder,
2
,
// index_t BBlockTransferSrcVectorDim,
8
,
// index_t BBlockTransferSrcScalarPerVector,
8
,
// index_t BBlockTransferDstScalarPerVector_BK1,
1
,
// bool BBlockLdsExtraN,
1
,
// index_t CShuffleMXdlPerWavePerShuffle,
1
,
// index_t CShuffleNXdlPerWavePerShuffle,
S
<
1
,
64
,
1
,
4
>
,
// typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
16
>
;
// index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
...
...
@@ -88,9 +105,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
// GEMM shape
ck
::
index_t
M
=
3840
;
...
...
@@ -108,13 +125,13 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
10
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
M
=
std
::
stoi
(
argv
[
4
]);
N
=
std
::
stoi
(
argv
[
5
]);
...
...
@@ -128,7 +145,7 @@ int main(int argc, char* argv[])
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
\n
"
);
exit
(
0
);
}
...
...
@@ -202,7 +219,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem"
);
}
float
ave_time
=
invoker
.
Run
(
argument
,
nrepeat
);
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
...
...
@@ -227,7 +244,7 @@ int main(int argc, char* argv[])
ref_invoker
.
Run
(
ref_argument
);
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
);
return
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
)
?
0
:
1
;
}
return
0
;
...
...
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
View file @
b134b7d6
...
...
@@ -60,21 +60,21 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
exit
(
0
);
}
...
...
@@ -131,7 +131,7 @@ int main(int argc, char* argv[])
std
::
size_t
flop
=
0
,
num_btype
=
0
;
for
(
in
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
{
a_tensors
.
push_back
(
Tensor
<
ADataType
>
(
f_host_tensor_descriptor
(
gemm_shapes
[
i
].
M
,
gemm_shapes
[
i
].
K
,
gemm_shapes
[
i
].
StrideA
,
ALayout
{})));
...
...
@@ -168,7 +168,7 @@ int main(int argc, char* argv[])
}
}
for
(
in
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
{
a_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_tensors
[
i
].
mDesc
.
GetElementSpace
()));
...
...
@@ -202,7 +202,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem"
);
}
float
ave_time
=
invoker
.
Run
(
argument
,
nrepeat
);
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
}
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
...
@@ -211,9 +211,10 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
for
(
in
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
for
(
std
::
size_
t
i
=
0
;
i
<
gemm_shapes
.
size
();
i
++
)
{
c_tensors_device
[
i
]
->
FromDevice
(
c_device_tensors
[
i
].
mData
.
data
());
auto
ref_gemm
=
ReferenceGemmInstance
{};
...
...
@@ -227,9 +228,9 @@ int main(int argc, char* argv[])
c_element_op
);
ref_invoker
.
Run
(
ref_argument
);
ck
::
utils
::
check_err
(
c_device_tensors
[
i
].
mData
,
c_host_tensors
[
i
].
mData
);
pass
&=
ck
::
utils
::
check_err
(
c_device_tensors
[
i
].
mData
,
c_host_tensors
[
i
].
mData
);
}
}
return
0
;
return
pass
?
0
:
1
;
}
example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
View file @
b134b7d6
...
...
@@ -4,6 +4,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -11,9 +12,10 @@
#include "device_tensor.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "
element_wise_
reduc
e
_operat
ion
.hpp"
#include "reduc
tion
_operat
or
.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
@@ -33,22 +35,23 @@ using ALayout = ck::tensor_layout::gemm::RowMajor;
using
BLayout
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
CLayout
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
tensor_operation
::
element_wise
::
ReduceSum
;
using
D1ReduceOp
=
ck
::
tensor_operation
::
element_wise
::
ReduceSquareSum
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
<
float
>
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
<
float
>
;
using
D1ElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
<
float
,
float
,
false
>
;
static
constexpr
auto
GemmSpecialization
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// clang-format off
using
DeviceGemmReduceInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1|
GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1|
D1EleOp|
GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
F32
,
AElementOp
,
BElementOp
,
CElementOp
,
D0ReduceOp
,
D1ReduceOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
F32
,
AElementOp
,
BElementOp
,
CElementOp
,
D0ReduceOp
,
D1ReduceOp
,
D1ElementOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
// clang-format on
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
...
...
@@ -56,9 +59,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
1
;
bool
do_verification
=
true
;
int
init_method
=
1
;
int
nrepeat
=
5
;
bool
time_kernel
=
false
;
// GEMM shape
ck
::
index_t
M
=
3840
;
...
...
@@ -77,13 +80,13 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
10
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
M
=
std
::
stoi
(
argv
[
4
]);
N
=
std
::
stoi
(
argv
[
5
]);
...
...
@@ -97,7 +100,7 @@ int main(int argc, char* argv[])
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
\n
"
);
exit
(
0
);
}
...
...
@@ -159,11 +162,10 @@ int main(int argc, char* argv[])
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
c_element_op
=
CElementOp
{};
auto
d0_reduce_op
=
D0ReduceOp
{};
auto
d1_reduce_op
=
D1ReduceOp
{};
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
c_element_op
=
CElementOp
{};
auto
d1_element_op
=
D1ElementOp
{};
// do GEMM
auto
gemm
=
DeviceGemmReduceInstance
{};
...
...
@@ -182,8 +184,7 @@ int main(int argc, char* argv[])
a_element_op
,
b_element_op
,
c_element_op
,
d0_reduce_op
,
d1_reduce_op
);
d1_element_op
);
if
(
!
gemm
.
IsSupportedArgument
(
argument
))
{
...
...
@@ -192,30 +193,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem"
);
}
// warm up
invoker
.
Run
(
argument
);
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
// timing
float
total_time
=
0
;
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
{
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
KernelTimer
timer
;
timer
.
Start
();
invoker
.
Run
(
argument
);
timer
.
End
();
total_time
+=
timer
.
GetElapsedTime
();
}
float
ave_time
=
total_time
/
nrepeat
;
// If time_kernel == true, the kernel will run multiple times. This kernel uses atomic-add, so the
// result will not be correct; time_kernel needs to be set to false for the correctness test.
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
...
...
@@ -228,6 +212,7 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
c_device_buf
.
FromDevice
(
c_m_n_device_result
.
mData
.
data
());
...
...
@@ -242,25 +227,41 @@ int main(int argc, char* argv[])
ref_invoker
.
Run
(
ref_argument
);
auto
d0_reduce_op
=
D0ReduceOp
{};
auto
d1_reduce_op
=
D1ReduceOp
{};
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
float
d0_acc
=
d0_reduce_op
.
GetReduc
e
ZeroVal
ue
();
float
d1_acc
=
d1_reduce_op
.
GetReduc
e
ZeroVal
ue
();
float
d0_acc
=
d0_reduce_op
.
GetReduc
tion
ZeroVal
();
float
d1_acc
=
d1_reduce_op
.
GetReduc
tion
ZeroVal
();
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
d0_reduce_op
.
Reduce
(
d0_acc
,
c_m_n_host_result
(
m
,
n
));
d1_reduce_op
.
Reduce
(
d1_acc
,
c_m_n_host_result
(
m
,
n
));
float
d0_val
=
ck
::
type_convert
<
float
>
(
c_m_n_host_result
(
m
,
n
));
float
d1_val
;
d1_element_op
(
d1_val
,
d0_val
);
d0_reduce_op
(
d0_acc
,
d0_val
);
d1_reduce_op
(
d1_acc
,
d1_val
);
}
d0_m_host_result
(
m
)
=
d0_acc
;
d1_m_host_result
(
m
)
=
d1_acc
;
d0_m_host_result
(
m
)
=
ck
::
type_convert
<
DDataType
>
(
d0_acc
)
;
d1_m_host_result
(
m
)
=
ck
::
type_convert
<
DDataType
>
(
d1_acc
)
;
}
check_error
(
c_m_n_host_result
,
c_m_n_device_result
);
check_error
(
d0_m_host_result
,
d0_m_device_result
);
check_error
(
d1_m_host_result
,
d1_m_device_result
);
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
,
"Error: Incorrect results c"
);
pass
&=
ck
::
utils
::
check_err
(
d0_m_device_result
.
mData
,
d0_m_host_result
.
mData
,
"Error: Incorrect results d0"
,
1e-3
,
1e-3
);
pass
&=
ck
::
utils
::
check_err
(
d1_m_device_result
.
mData
,
d1_m_host_result
.
mData
,
"Error: Incorrect results d1"
,
1e-3
,
1e-3
);
}
return
0
;
return
pass
?
0
:
1
;
}
example/17_convnd_bwd_data_xdl/CMakeLists.txt
View file @
b134b7d6
add_example_executable
(
example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp
)
target_link_libraries
(
example_convnd_bwd_data_xdl PRIVATE conv_
fwd_
util
)
target_link_libraries
(
example_convnd_bwd_data_xdl PRIVATE conv_util
)
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
View file @
b134b7d6
...
...
@@ -6,7 +6,7 @@
#include <half.hpp>
#include "config.hpp"
#include "conv_
fwd_
util.hpp"
#include "conv_util.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -87,7 +87,7 @@ void print_use_msg()
{
std
::
cout
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=random value, 2= init to 1 )
\n
"
<<
"arg3:
run
kernel
# of times (>1
)
\n
"
<<
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
<<
"arg4: N spatial dimensions (default 2)
\n
"
<<
"Following arguments (depending on number of spatial dims):
\n
"
<<
" N, K, C,
\n
"
...
...
@@ -105,40 +105,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
ck
::
utils
::
conv
::
ConvParams
params
;
int
arg_idx
=
5
;
params
.
num_dim_spatial
=
num_dim_spatial
;
params
.
N
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
num_dim_spatial
_
=
num_dim_spatial
;
params
.
N
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
K
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
C
_
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
.
resize
(
num_dim_spatial
);
params
.
filter_spatial_lengths
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
filter_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
filter_spatial_lengths
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_spatial_lengths
.
resize
(
num_dim_spatial
);
params
.
input_spatial_lengths
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_spatial_lengths
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_spatial_lengths
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_strides
.
resize
(
num_dim_spatial
);
params
.
conv_filter_strides
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_strides
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
conv_filter_strides
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
conv_filter_dilations
.
resize
(
num_dim_spatial
);
params
.
conv_filter_dilations
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
conv_filter_dilations
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
conv_filter_dilations
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_left_pads
.
resize
(
num_dim_spatial
);
params
.
input_left_pads
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_left_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_left_pads
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
params
.
input_right_pads
.
resize
(
num_dim_spatial
);
params
.
input_right_pads
_
.
resize
(
num_dim_spatial
);
for
(
int
i
=
0
;
i
<
num_dim_spatial
;
++
i
)
{
params
.
input_right_pads
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
params
.
input_right_pads
_
[
i
]
=
std
::
stoi
(
argv
[
arg_idx
++
]);
}
return
params
;
...
...
@@ -165,25 +165,25 @@ DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
0
;
int
init_method
=
0
;
int
nrepeat
=
5
;
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
int
num_dim_spatial
=
2
;
ck
::
utils
::
conv
::
ConvParams
params
;
params
.
C
=
128
;
params
.
C
_
=
128
;
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
>
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
// check args number
int
conv_args
=
3
+
num_dim_spatial
*
6
;
...
...
@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
exit
(
1
);
}
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
),
static_cast
<
std
::
size_t
>
(
params
.
C
)};
std
::
vector
<
std
::
size_t
>
input_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
_
),
static_cast
<
std
::
size_t
>
(
params
.
C
_
)};
input_dims
.
insert
(
std
::
end
(
input_dims
),
std
::
begin
(
params
.
input_spatial_lengths
),
std
::
end
(
params
.
input_spatial_lengths
));
std
::
begin
(
params
.
input_spatial_lengths
_
),
std
::
end
(
params
.
input_spatial_lengths
_
));
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params
.
K
),
static_cast
<
std
::
size_t
>
(
params
.
C
)};
std
::
vector
<
std
::
size_t
>
filter_dims
{
static_cast
<
std
::
size_t
>
(
params
.
K
_
),
static_cast
<
std
::
size_t
>
(
params
.
C
_
)};
filter_dims
.
insert
(
std
::
end
(
filter_dims
),
std
::
begin
(
params
.
filter_spatial_lengths
),
std
::
end
(
params
.
filter_spatial_lengths
));
std
::
begin
(
params
.
filter_spatial_lengths
_
),
std
::
end
(
params
.
filter_spatial_lengths
_
));
const
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
=
params
.
GetOutputSpatialLengths
();
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
),
static_cast
<
std
::
size_t
>
(
params
.
K
)};
std
::
vector
<
std
::
size_t
>
output_dims
{
static_cast
<
std
::
size_t
>
(
params
.
N
_
),
static_cast
<
std
::
size_t
>
(
params
.
K
_
)};
output_dims
.
insert
(
std
::
end
(
output_dims
),
std
::
begin
(
output_spatial_lengths
),
std
::
end
(
output_spatial_lengths
));
...
...
@@ -263,16 +263,16 @@ int main(int argc, char* argv[])
conv
->
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
WeiDataType
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
params
.
N
,
params
.
K
,
params
.
C
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
params
.
N
_
,
params
.
K
_
,
params
.
C
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
...
...
@@ -284,16 +284,16 @@ int main(int argc, char* argv[])
"not support this Conv problem"
);
}
float
ave_time
=
invoker
->
Run
(
argument
.
get
(),
nrepeat
);
float
ave_time
=
invoker
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
}
);
std
::
size_t
flop
=
ck
::
utils
::
conv
::
get_flops
(
params
.
N
,
params
.
C
,
params
.
K
,
params
.
filter_spatial_lengths
,
output_spatial_lengths
);
params
.
N
_
,
params
.
C
_
,
params
.
K
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
);
std
::
size_t
num_btype
=
ck
::
utils
::
conv
::
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
params
.
N
,
params
.
C
,
params
.
K
,
params
.
input_spatial_lengths
,
params
.
filter_spatial_lengths
,
params
.
N
_
,
params
.
C
_
,
params
.
K
_
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
...
@@ -310,10 +310,10 @@ int main(int argc, char* argv[])
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in_n_c_hi_wi_host_result
,
wei_k_c_y_x
,
out_n_k_ho_wo
,
params
.
conv_filter_strides
,
params
.
conv_filter_dilations
,
params
.
input_left_pads
,
params
.
input_right_pads
,
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
_
,
params
.
input_right_pads
_
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
...
...
@@ -322,7 +322,10 @@ int main(int argc, char* argv[])
in_device_buf
.
FromDevice
(
in_n_c_hi_wi_device_result
.
mData
.
data
());
check_error
(
in_n_c_hi_wi_host_result
,
in_n_c_hi_wi_device_result
);
return
ck
::
utils
::
check_err
(
in_n_c_hi_wi_device_result
.
mData
,
in_n_c_hi_wi_host_result
.
mData
)
?
0
:
1
;
};
switch
(
num_dim_spatial
)
...
...
@@ -347,4 +350,5 @@ int main(int argc, char* argv[])
}
}
}
return
0
;
}
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
View file @
b134b7d6
...
...
@@ -4,6 +4,7 @@
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
...
...
@@ -11,9 +12,9 @@
#include "device_tensor.hpp"
#include "device_batched_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_batched_gemm.hpp"
#include "gemm_specialization.hpp"
#include "element_wise_reduce_operation.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
...
...
@@ -33,22 +34,23 @@ using ALayout = ck::tensor_layout::gemm::RowMajor;
using
BLayout
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
CLayout
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
tensor_operation
::
element_wise
::
ReduceSum
;
using
D1ReduceOp
=
ck
::
tensor_operation
::
element_wise
::
ReduceSquareSum
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
D0ReduceOp
=
ck
::
reduce
::
Add
<
float
>
;
using
D1ReduceOp
=
ck
::
reduce
::
Add
<
float
>
;
using
D1ElementOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
<
float
,
float
,
false
>
;
static
constexpr
auto
GemmSpecialization
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
// clang-format off
using
DeviceBatchedGemmReduceInstance
=
ck
::
tensor_operation
::
device
::
DeviceBatchedGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1|
GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1|
D1EleOp|
GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
F32
,
AElementOp
,
BElementOp
,
CElementOp
,
D0ReduceOp
,
D1ReduceOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
<
Row
,
Col
,
Row
,
F16
,
F16
,
F16
,
F32
,
F32
,
F32
,
F32
,
AElementOp
,
BElementOp
,
CElementOp
,
D0ReduceOp
,
D1ReduceOp
,
D1ElementOp
,
GemmSpecialization
,
1
,
256
,
256
,
128
,
32
,
8
,
8
,
32
,
32
,
4
,
2
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
,
S
<
64
,
4
>
,
4
,
1
>
;
// clang-format on
using
ReferenceBatchedGemmInstance
=
ck
::
tensor_operation
::
host
::
...
...
@@ -56,18 +58,18 @@ using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
1
;
bool
do_verification
=
true
;
int
init_method
=
1
;
int
nrepeat
=
5
;
bool
time_kernel
=
false
;
// GEMM shape
ck
::
index_t
M
=
3840
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
K
=
4096
;
ck
::
index_t
M
=
2048
;
ck
::
index_t
N
=
1920
;
ck
::
index_t
K
=
2048
;
ck
::
index_t
StrideA
=
4096
;
ck
::
index_t
StrideB
=
4096
;
ck
::
index_t
StrideC
=
4096
;
ck
::
index_t
StrideA
=
2048
;
ck
::
index_t
StrideB
=
2048
;
ck
::
index_t
StrideC
=
1920
;
ck
::
index_t
BatchCount
=
4
;
...
...
@@ -79,13 +81,13 @@ int main(int argc, char* argv[])
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
11
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
M
=
std
::
stoi
(
argv
[
4
]);
N
=
std
::
stoi
(
argv
[
5
]);
...
...
@@ -95,13 +97,13 @@ int main(int argc, char* argv[])
StrideB
=
std
::
stoi
(
argv
[
8
]);
StrideC
=
std
::
stoi
(
argv
[
9
]);
BatchCount
=
std
::
stoi
(
argv
[
9
]);
BatchCount
=
std
::
stoi
(
argv
[
10
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3:
run
kernel
# of times (>1
)
\n
"
);
printf
(
"arg3:
time
kernel
(0=n0, 1=yes
)
\n
"
);
printf
(
"arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount
\n
"
);
exit
(
0
);
}
...
...
@@ -168,11 +170,12 @@ int main(int argc, char* argv[])
a_device_buf
.
ToDevice
(
a_g_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_g_k_n
.
mData
.
data
());
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
c_element_op
=
CElementOp
{};
auto
d0_reduce_op
=
D0ReduceOp
{};
auto
d1_reduce_op
=
D1ReduceOp
{};
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
c_element_op
=
CElementOp
{};
auto
d0_reduce_op
=
D0ReduceOp
{};
auto
d1_reduce_op
=
D1ReduceOp
{};
auto
d1_element_op
=
D1ElementOp
{};
// do GEMM
auto
batched_gemm
=
DeviceBatchedGemmReduceInstance
{};
...
...
@@ -192,8 +195,7 @@ int main(int argc, char* argv[])
a_element_op
,
b_element_op
,
c_element_op
,
d0_reduce_op
,
d1_reduce_op
,
d1_element_op
,
BatchCount
);
if
(
!
batched_gemm
.
IsSupportedArgument
(
argument
))
...
...
@@ -203,30 +205,13 @@ int main(int argc, char* argv[])
"not support this GEMM problem"
);
}
// warm up
invoker
.
Run
(
argument
);
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
// timing
float
total_time
=
0
;
for
(
int
i
=
0
;
i
<
nrepeat
;
++
i
)
{
// init DO, D1 to 0
d0_device_buf
.
SetZero
();
d1_device_buf
.
SetZero
();
KernelTimer
timer
;
timer
.
Start
();
invoker
.
Run
(
argument
);
timer
.
End
();
total_time
+=
timer
.
GetElapsedTime
();
}
float
ave_time
=
total_time
/
nrepeat
;
// If time_kernel == true, the kernel will run multiple times. This kernel uses atomic-add, so the
// result will not be correct; time_kernel needs to be set to false for the correctness test.
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
BatchCount
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
BatchCount
*
M
*
K
+
...
...
@@ -240,6 +225,7 @@ int main(int argc, char* argv[])
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
batched_gemm
.
GetTypeString
()
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
c_device_buf
.
FromDevice
(
c_g_m_n_device_result
.
mData
.
data
());
...
...
@@ -258,24 +244,36 @@ int main(int argc, char* argv[])
{
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
float
d0_acc
=
d0_reduce_op
.
GetReduc
e
ZeroVal
ue
();
float
d1_acc
=
d1_reduce_op
.
GetReduc
e
ZeroVal
ue
();
float
d0_acc
=
d0_reduce_op
.
GetReduc
tion
ZeroVal
();
float
d1_acc
=
d1_reduce_op
.
GetReduc
tion
ZeroVal
();
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
d0_reduce_op
.
Reduce
(
d0_acc
,
c_g_m_n_host_result
(
batch
,
m
,
n
));
d1_reduce_op
.
Reduce
(
d1_acc
,
c_g_m_n_host_result
(
batch
,
m
,
n
));
float
d0_val
=
ck
::
type_convert
<
float
>
(
c_g_m_n_host_result
(
batch
,
m
,
n
));
float
d1_val
;
d1_element_op
(
d1_val
,
d0_val
);
d0_reduce_op
(
d0_acc
,
d0_val
);
d1_reduce_op
(
d1_acc
,
d1_val
);
}
d0_g_m_host_result
(
batch
,
m
)
=
d0_acc
;
d1_g_m_host_result
(
batch
,
m
)
=
d1_acc
;
d0_g_m_host_result
(
batch
,
m
)
=
ck
::
type_convert
<
DDataType
>
(
d0_acc
)
;
d1_g_m_host_result
(
batch
,
m
)
=
ck
::
type_convert
<
DDataType
>
(
d1_acc
)
;
}
}
check_error
(
c_g_m_n_host_result
,
c_g_m_n_device_result
);
check_error
(
d0_g_m_host_result
,
d0_g_m_device_result
);
check_error
(
d1_g_m_host_result
,
d1_g_m_device_result
);
pass
&=
ck
::
utils
::
check_err
(
c_g_m_n_host_result
.
mData
,
c_g_m_n_device_result
.
mData
);
pass
&=
ck
::
utils
::
check_err
(
d0_g_m_device_result
.
mData
,
d0_g_m_host_result
.
mData
,
"Error: Incorrect results! D0"
,
1e-3
,
1e-3
);
pass
&=
ck
::
utils
::
check_err
(
d1_g_m_device_result
.
mData
,
d1_g_m_host_result
.
mData
,
"Error: Incorrect results! D1"
,
1e-3
,
1e-3
);
}
return
0
;
return
pass
?
0
:
1
;
}
example/CMakeLists.txt
View file @
b134b7d6
...
...
@@ -19,9 +19,18 @@ include_directories(BEFORE
add_custom_target
(
examples
)
function
(
add_example_executable EXAMPLE_NAME
)
function
(
add_example_executable EXAMPLE_NAME
FILE_NAME
)
message
(
"adding example
${
EXAMPLE_NAME
}
"
)
add_executable
(
${
EXAMPLE_NAME
}
${
ARGN
}
)
add_executable
(
${
EXAMPLE_NAME
}
${
FILE_NAME
}
)
target_link_libraries
(
${
EXAMPLE_NAME
}
PRIVATE host_tensor
)
add_test
(
NAME
${
EXAMPLE_NAME
}
COMMAND $<TARGET_FILE:
${
EXAMPLE_NAME
}
>
${
ARGN
}
)
add_dependencies
(
examples
${
EXAMPLE_NAME
}
)
add_dependencies
(
check
${
EXAMPLE_NAME
}
)
endfunction
(
add_example_executable EXAMPLE_NAME
)
function
(
add_example_executable_no_testing EXAMPLE_NAME FILE_NAME
)
message
(
"adding example
${
EXAMPLE_NAME
}
"
)
add_executable
(
${
EXAMPLE_NAME
}
${
FILE_NAME
}
)
target_link_libraries
(
${
EXAMPLE_NAME
}
PRIVATE host_tensor
)
add_dependencies
(
examples
${
EXAMPLE_NAME
}
)
endfunction
(
add_example_executable EXAMPLE_NAME
)
...
...
include/ck/config.hpp
View file @
b134b7d6
...
...
@@ -26,17 +26,14 @@
#endif
#endif
// buffer resour
se, wave siz
e
// buffer resour
c
e
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#define CK_GPU_WAVE_SIZE -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#define CK_GPU_WAVE_SIZE 64
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_GPU_WAVE_SIZE 32
#endif
// FMA instruction
...
...
@@ -112,6 +109,10 @@
// experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
// experimental feature: optimize for inter-wave scheduling policy
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
...
...
include/ck/hip_version.hpp.in
deleted
100644 → 0
View file @
090ba885
#pragma once
// "_PACKAGE_" to avoid name contentions: the macros like
// HIP_VERSION_MAJOR are defined in HIP_VERSION.h.
// clang-format off
#define CK_HIP_PACKAGE_VERSION_MAJOR @CK_HIP_VERSION_MAJOR@
#define CK_HIP_PACKAGE_VERSION_MINOR @CK_HIP_VERSION_MINOR@
#define CK_HIP_PACKAGE_VERSION_PATCH @CK_HIP_VERSION_PATCH@
// clang-format on
#ifndef CK_HIP_PACKAGE_VERSION_MAJOR
#define CK_HIP_PACKAGE_VERSION_MAJOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_MINOR
#define CK_HIP_PACKAGE_VERSION_MINOR 0
#endif
#ifndef CK_HIP_PACKAGE_VERSION_PATCH
#define CK_HIP_PACKAGE_VERSION_PATCH 0
#endif
// 3 decimal digits for major and minor, 6 digits for patch number.
// Max number is 999,999,999999 == 0xE8,D4A5,0FFF that fits into 64-bit math.
// Every component (major, minor, patch) is range-checked here so that no field can
// spill into its neighbour when the components are packed into a single flat number.
#if CK_HIP_PACKAGE_VERSION_MAJOR > 999 || CK_HIP_PACKAGE_VERSION_MINOR > 999 || \
    CK_HIP_PACKAGE_VERSION_PATCH > 999999
#error "Too big HIP version number(s)"
#endif
#define CK_HIP_PACKAGE_VERSION_FLAT \
((CK_HIP_PACKAGE_VERSION_MAJOR * 1000ULL + CK_HIP_PACKAGE_VERSION_MINOR) * 1000000 + \
CK_HIP_PACKAGE_VERSION_PATCH)
include/ck/options.hpp.in
0 → 100644
View file @
b134b7d6
#pragma once
#cmakedefine01 CK_TIME_KERNEL
include/ck/stream_config.hpp
0 → 100644
View file @
b134b7d6
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// Options passed alongside the argument to an invoker's Run() call
// (e.g. `invoker.Run(argument, StreamConfig{nullptr, time_kernel})`).
// Kept as a plain aggregate so it can be brace-initialised in member order.
struct StreamConfig
{
    // HIP stream handle; defaults to nullptr.
    hipStream_t stream_id_ = nullptr;
    // Kernel-timing switch; off by default. NOTE(review): call sites warn that timing
    // re-runs the kernel, which breaks atomic-add based kernels' results — confirm in invoker.
    bool time_kernel_ = false;
};
include/ck/tensor_description/tensor_descriptor_helper.hpp
View file @
b134b7d6
#ifndef CK_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_TENSOR_DESCRIPTOR_HELPER_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "multi_index_transform_helper.hpp"
...
...
@@ -35,6 +33,12 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
}
#endif
// Lengths..., Strides... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// element_space_size could be:
// 1) long_index_t, or
// 2) LongNumber<>
template
<
typename
...
Lengths
,
typename
...
Strides
,
typename
enable_if
<
sizeof
...(
Lengths
)
==
sizeof
...(
Strides
),
bool
>
::
type
=
false
>
...
...
@@ -68,10 +72,10 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
}
};
const
auto
element_space_size
=
f
(
f
,
Number
<
0
>
{},
Number
<
1
>
{});
const
auto
element_space_size
=
f
(
f
,
Number
<
0
>
{},
Long
Number
<
1
>
{});
#else
const
auto
element_space_size
=
calculate_element_space_size_impl
(
lengths
,
strides
,
Number
<
0
>
{},
Number
<
1
>
{});
calculate_element_space_size_impl
(
lengths
,
strides
,
Number
<
0
>
{},
Long
Number
<
1
>
{});
#endif
return
TensorDescriptor
<
remove_cv_t
<
decltype
(
transforms
)
>
,
...
...
@@ -82,9 +86,12 @@ __host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Leng
element_space_size
};
}
// Lengths... can be:
// 1) index_t, which is known at run-time
// Lengths... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// element_space_size could be:
// 1) long_index_t, or
// 2) LongNumber<>
template
<
typename
...
Lengths
>
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor_packed
(
const
Tuple
<
Lengths
...
>&
lengths
)
...
...
@@ -100,7 +107,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
constexpr
auto
visible_dim_hidden_ids
=
typename
arithmetic_sequence_gen
<
1
,
N
+
1
,
1
>::
type
{};
const
auto
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
{},
Number
<
1
>
{});
const
auto
element_space_size
=
container_reduce
(
lengths
,
math
::
multiplies
{},
Long
Number
<
1
>
{});
return
TensorDescriptor
<
remove_cv_t
<
decltype
(
transforms
)
>
,
remove_cv_t
<
decltype
(
low_dim_hidden_idss
)
>
,
...
...
@@ -110,6 +117,12 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
element_space_size
};
}
// Lengths... could be:
// 1) index_t, which is known at run-time, or
// 2) Number<>, which is known at compile-time
// align could be:
// 1) index_t, or
// 2) Number<>
template
<
typename
...
Lengths
,
typename
Align
>
__host__
__device__
constexpr
auto
make_naive_tensor_descriptor_aligned
(
const
Tuple
<
Lengths
...
>&
lengths
,
Align
align
)
...
...
@@ -146,4 +159,3 @@ make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align ali
}
}
// namespace ck
#endif
Prev
1
2
3
4
5
6
…
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment