composable_kernel — commit 68886f7d, authored Jun 14, 2022 by raman jana

merging with latest develop branch

Parents: a9ee2960, 1677cf70
Changes: 328 files in the merge; showing 20 changed files with 1657 additions and 236 deletions (+1657, −236).
- example/09_convnd_fwd/CMakeLists.txt (+6, −3)
- example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp (+15, −17)
- example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp (+19, −16)
- example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp (+344, −0)
- example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp (+14, −16)
- example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp (+12, −8)
- example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp (+11, −8)
- example/12_reduce/CMakeLists.txt (+1, −0)
- example/12_reduce/README.md (+28, −13)
- example/12_reduce/reduce_blockwise.cpp (+78, −124)
- example/12_reduce/reduce_blockwise_two_call.cpp (+290, −0)
- example/13_pool2d_fwd/CMakeLists.txt (+3, −1)
- example/13_pool2d_fwd/README.md (+27, −8)
- example/13_pool2d_fwd/pool2d_fwd_common.hpp (+281, −0)
- example/13_pool2d_fwd/pool2d_fwd_fp16.cpp (+114, −0)
- example/13_pool2d_fwd/pool2d_fwd_fp32.cpp (+114, −0)
- example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp (+15, −10)
- example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp (+17, −11)
- example/16_gemm_reduce/CMakeLists.txt (+2, −1)
- example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp (+266, −0)
example/09_convnd_fwd/CMakeLists.txt

```cmake
add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_util)

add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_util)
add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)

# FIXME: re-enable this example as test when SWDEV-335738 is fixed
add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
target_link_libraries(example_convnd_fwd_xdl_fp64 PRIVATE conv_util)

target_link_libraries(example_convnd_fwd_xdl_fp32 PRIVATE conv_util)
target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_util)
target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_util)
```
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp

```diff
@@ -110,7 +110,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;

-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;

     ck::utils::conv::ConvParams params;
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -277,7 +277,7 @@ int main(int argc, char* argv[])
                                  "not support this Conv problem");
     }

-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
@@ -291,7 +291,7 @@ int main(int argc, char* argv[])
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;

-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << conv->GetTypeString() << std::endl;

     if(do_verification)
@@ -312,30 +312,28 @@ int main(int argc, char* argv[])
             ref_invoker.Run(ref_argument);

             out_device_buf.FromDevice(device_output.mData.data());

-            ck::utils::check_err(host_output.mData,
-                                 device_output.mData,
-                                 "Error: incorrect results!",
-                                 1e-5f,
-                                 1e-4f);
+            return ck::utils::check_err(host_output.mData,
+                                        device_output.mData,
+                                        "Error: incorrect results!",
+                                        1e-5f,
+                                        1e-4f)
+                       ? 0
+                       : 1;
         };

         switch(num_dim_spatial)
         {
         case 3: {
             auto ref_conv = ReferenceConvNDFwdInstance<3>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         case 2: {
             auto ref_conv = ReferenceConvNDFwdInstance<2>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         case 1: {
             auto ref_conv = ReferenceConvNDFwdInstance<1>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         default: {
             throw std::runtime_error("Unsupported number of spatial dimensions provided!");
         }
         }
     }

     return 0;
 }
```
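Every example touched by this merge swaps the bare `nrepeat` count passed to `Run` for a `StreamConfig{nullptr, time_kernel}` argument, so the timing policy travels with the call instead of being a loop count. A minimal, self-contained sketch of that design follows; all names in it are illustrative stand-ins, not the CK API:

```cpp
// Sketch only: a two-field config (stream handle + timing flag) selects between
// "launch once" and "warm up, repeat, and average" — mirroring the pattern the
// hunks above converge on. Assumes C++17 for the aggregate initialization.
#include <chrono>
#include <iostream>

struct StreamConfigSketch
{
    void* stream_id_  = nullptr; // nullptr: default stream
    bool time_kernel_ = false;   // false: launch once, no measurement
};

template <typename Kernel>
float run(Kernel&& kernel, const StreamConfigSketch& cfg, int nrepeat = 10)
{
    if(!cfg.time_kernel_)
    {
        kernel(); // single launch, no timing
        return 0.f;
    }
    kernel(); // warm up once
    auto t0 = std::chrono::steady_clock::now();
    for(int i = 0; i < nrepeat; ++i)
        kernel();
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<float, std::milli>(t1 - t0).count() / nrepeat;
}

int main()
{
    float ms = run([] { /* kernel launch would go here */ }, StreamConfigSketch{nullptr, true});
    std::cout << "avg " << ms << " ms\n";
}
```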
example/09_convnd_fwd/convnd_fwd_xdl.cpp → example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp (renamed)
```diff
@@ -107,7 +107,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;

-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;

     ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -276,7 +276,7 @@ int main(int argc, char* argv[])
                                  "not support this Conv problem");
     }

-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
@@ -311,30 +311,33 @@ int main(int argc, char* argv[])
             ref_invoker.Run(ref_argument);

             out_device_buf.FromDevice(device_output.mData.data());

-            ck::utils::check_err(host_output.mData,
-                                 device_output.mData,
-                                 "Error: incorrect results!",
-                                 1e-5f,
-                                 1e-4f);
+            return ck::utils::check_err(device_output.mData,
+                                        host_output.mData,
+                                        "Error: incorrect results!",
+                                        1e-5f,
+                                        1e-4f)
+                       ? 0
+                       : 1;
         };

         switch(num_dim_spatial)
         {
         case 3: {
             auto ref_conv = ReferenceConvNDFwdInstance<3>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         case 2: {
             auto ref_conv = ReferenceConvNDFwdInstance<2>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         case 1: {
             auto ref_conv = ReferenceConvNDFwdInstance<1>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         default: {
             throw std::runtime_error("Unsupported number of spatial dimensions provided!");
         }
         }
     }

     return 0;
 }
```
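These examples now propagate `check_err`'s boolean into the process exit code, so a test harness can fail the example on a mismatch. A compact, self-contained sketch of the same convention, with a hypothetical `check_values` standing in for `ck::utils::check_err`:

```cpp
// Sketch only: turn a verification result into the exit code (0 = pass,
// 1 = mismatch), the convention the diffs above introduce. check_values is
// a stand-in comparator, not the CK helper.
#include <cmath>
#include <cstddef>
#include <vector>

static bool check_values(const std::vector<float>& ref,
                         const std::vector<float>& out,
                         float rtol = 1e-5f,
                         float atol = 1e-4f)
{
    if(ref.size() != out.size())
        return false;
    for(std::size_t i = 0; i < ref.size(); ++i)
        if(std::fabs(ref[i] - out[i]) > atol + rtol * std::fabs(ref[i]))
            return false;
    return true;
}

int main()
{
    std::vector<float> ref{1.f, 2.f}, out{1.f, 2.f};
    return check_values(ref, out) ? 0 : 1;
}
```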
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp (new file, mode 100644)
```cpp
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>

#include "check_err.hpp"
#include "config.hpp"
#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace {

using InDataType  = double;
using WeiDataType = double;
using OutDataType = double;
using AccDataType = double;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

using DeviceConvFwdBasePtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance =
    ck::tensor_operation::device::DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,     //
        WeiDataType,    //
        OutDataType,    //
        AccDataType,    //
        InElementOp,    // Input Elementwise Operation
        WeiElementOp,   // Weights Elementwise Operation
        OutElementOp,   // Output Elementwise Operation
        ConvFwdDefault, // ConvForwardSpecialization
        NumDimSpatial,  // NumDimSpatial
        256,            // BlockSize
        128,            // MPerBlock
        128,            // NPerBlock
        4,              // K0PerBlock
        2,              // K1
        16,             // MPerXDL
        16,             // NPerXDL
        4,              // MXdlPerWave
        4,              // NXdlPerWave
        S<4, 64, 1>,    // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,     // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // ABlockTransferSrcAccessOrder
        2,              // ABlockTransferSrcVectorDim
        2,              // ABlockTransferSrcScalarPerVector
        2,              // ABlockTransferDstScalarPerVector_K1
        true,           // ABlockLdsAddExtraM
        S<4, 64, 1>,    // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,     // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,     // BBlockTransferSrcAccessOrder
        2,              // BBlockTransferSrcVectorDim
        2,              // BBlockTransferSrcScalarPerVector
        2,              // BBlockTransferDstScalarPerVector_K1
        true,           // BBlockTransferAddExtraN
        7,              // CThreadTransferSrcDstVectorDim
        1>;             // CThreadTransferDstScalarPerVector
// clang-format on

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                WeiDataType,
                                                                                OutDataType,
                                                                                InElementOp,
                                                                                WeiElementOp,
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 3: {
        return std::make_unique<DeviceConvNDFwdInstance<3>>();
    }
    case 2: {
        return std::make_unique<DeviceConvNDFwdInstance<2>>();
    }
    case 1: {
        return std::make_unique<DeviceConvNDFwdInstance<1>>();
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

void print_use_msg()
{
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: run kernel # of times (>1)\n"
              << "arg4: N spatial dimensions (default 2)\n"
              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
}

ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
    int conv_args     = 3 + num_dim_spatial * 6;
    int cmdline_nargs = conv_args + 5;
    if(cmdline_nargs != argc)
    {
        print_use_msg();
        exit(0);
    }

    ck::utils::conv::ConvParams params;
    int arg_idx = 5;

    params.num_dim_spatial_ = num_dim_spatial;
    params.N_               = std::stoi(argv[arg_idx++]);
    params.K_               = std::stoi(argv[arg_idx++]);
    params.C_               = std::stoi(argv[arg_idx++]);

    params.filter_spatial_lengths_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_spatial_lengths_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_strides_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_dilations_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_left_pads_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_right_pads_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
    }

    return params;
}

} // anonymous namespace

int main(int argc, char* argv[])
{
    using namespace ck::utils::conv;

    bool do_verification = 0;
    int init_method      = 0;
    bool time_kernel     = false;
    int num_dim_spatial  = 2;

    ck::utils::conv::ConvParams params;

    if(argc >= 5)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);
    }
    if(argc >= 6)
    {
        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
                                        static_cast<std::size_t>(params.C_)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths_),
                      std::end(params.input_spatial_lengths_));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
                                         static_cast<std::size_t>(params.C_)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths_),
                       std::end(params.filter_spatial_lengths_));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
                                         static_cast<std::size_t>(params.K_)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    case 2:
        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
        break;
    default:
        input.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    // do GEMM
    auto conv     = get_conv_instance(num_dim_spatial);
    auto invoker  = conv->MakeInvokerPointer();
    auto argument =
        conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                  static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                  static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                  params.N_,
                                  params.K_,
                                  params.C_,
                                  params.input_spatial_lengths_,
                                  params.filter_spatial_lengths_,
                                  output_spatial_lengths,
                                  params.conv_filter_strides_,
                                  params.conv_filter_dilations_,
                                  params.input_left_pads_,
                                  params.input_right_pads_,
                                  InElementOp{},
                                  WeiElementOp{},
                                  OutElementOp{});

    if(!conv->IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error("wrong! device_conv with the specified compilation parameters does "
                                 "not support this Conv problem");
    }

    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop = get_flops(
        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
    std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
        params.N_,
        params.C_,
        params.K_,
        params.input_spatial_lengths_,
        params.filter_spatial_lengths_,
        output_spatial_lengths);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    if(do_verification)
    {
        auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
                            const auto& ref_conv) {
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input,
                                                      weights,
                                                      host_output,
                                                      params.conv_filter_strides_,
                                                      params.conv_filter_dilations_,
                                                      params.input_left_pads_,
                                                      params.input_right_pads_,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});

            ref_invoker.Run(ref_argument);

            out_device_buf.FromDevice(device_output.mData.data());

            ck::utils::check_err(
                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
        };

        switch(num_dim_spatial)
        {
        case 3: {
            auto ref_conv = ReferenceConvNDFwdInstance<3>();
            verify_f(ref_conv);
            break;
        }
        case 2: {
            auto ref_conv = ReferenceConvNDFwdInstance<2>();
            verify_f(ref_conv);
            break;
        }
        case 1: {
            auto ref_conv = ReferenceConvNDFwdInstance<1>();
            verify_f(ref_conv);
            break;
        }
        default: {
            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
        }
        }
    }
}
```
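In `parse_conv_params` above, the expected argument count is `3 + num_dim_spatial * 6` conv parameters plus five leading entries (`argv[0]` and the four leading flags). A quick compile-time check of that arithmetic — a standalone sketch, not part of the example:

```cpp
// Sanity check of the fp64 example's argument-count arithmetic.
constexpr int expected_argc(int num_dim_spatial)
{
    // N, K, C plus six per-dimension vectors, then argv[0] + 4 leading flags.
    return (3 + num_dim_spatial * 6) + 5;
}

static_assert(expected_argc(1) == 14, "1-D case");
static_assert(expected_argc(2) == 20, "2-D case");
static_assert(expected_argc(3) == 26, "3-D case");

int main() { return 0; }
```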
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp

```diff
@@ -112,7 +112,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=no, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;

-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial  = 2;

     ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -279,7 +279,7 @@ int main(int argc, char* argv[])
                                  "not support this Conv problem");
     }

-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

     std::size_t flop = get_flops(
         params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
@@ -314,30 +314,28 @@ int main(int argc, char* argv[])
             ref_invoker.Run(ref_argument);

             out_device_buf.FromDevice(device_output.mData.data());

-            ck::utils::check_err(host_output.mData,
-                                 device_output.mData,
-                                 "Error: incorrect results!",
-                                 1e-5f,
-                                 1e-4f);
+            return ck::utils::check_err(host_output.mData,
+                                        device_output.mData,
+                                        "Error: incorrect results!",
+                                        1e-5f,
+                                        1e-4f)
+                       ? 0
+                       : 1;
         };

         switch(num_dim_spatial)
         {
         case 3: {
             auto ref_conv = ReferenceConvNDFwdInstance<3>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         case 2: {
             auto ref_conv = ReferenceConvNDFwdInstance<2>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         case 1: {
             auto ref_conv = ReferenceConvNDFwdInstance<1>();
-            verify_f(ref_conv);
-            break;
+            return verify_f(ref_conv);
         }
         default: {
             throw std::runtime_error("Unsupported number of spatial dimensions provided!");
         }
         }
     }

     return 0;
 }
```
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp

```diff
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;

     // Conv shape
     ck::index_t N = 128;
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 19)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);

         N = std::stoi(argv[4]);
         K = std::stoi(argv[5]);
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(0);
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
                                  "not support this Conv problem");
     }

-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -249,6 +249,10 @@ int main(int argc, char* argv[])
         in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());

-        ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData);
+        return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+                                    in_n_c_hi_wi_host_result.mData)
+                   ? 0
+                   : 1;
     }

     return 0;
 }
```
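The perf lines in these examples rely on `ave_time` being in milliseconds: `flop / 1.E9 / ave_time` is Gop per ms, which equals Top per s (TFlops), and `num_btype / 1.E6 / ave_time` is MB per ms, i.e. GB/s. A small standalone illustration of that unit arithmetic — the concrete shape below is made up for the example, not taken from the diff:

```cpp
// Unit check for the perf formulas used throughout these examples.
#include <cstdio>

int main()
{
    // flop = 2 * N * K * Ho * Wo * C * Y * X, as in the bwd-data example;
    // this shape is illustrative only.
    const double flop      = 2.0 * 128 * 256 * 36 * 36 * 192 * 3 * 3;
    const double num_bytes = 4.0 * 128 * 192 * 71 * 71; // bytes moved, illustrative
    const double ave_time  = 0.5;                       // milliseconds

    const double tflops     = flop / 1.e9 / ave_time;      // Gop/ms == Top/s
    const double gb_per_sec = num_bytes / 1.e6 / ave_time; // MB/ms == GB/s

    std::printf("%.3f TFlops, %.3f GB/s\n", tflops, gb_per_sec);
}
```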
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp

```diff
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int do_log  = 0;
     int split_k = 4;
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
     }
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4: is show log (0=no, 1=yes)\n");
         printf("arg5: split-k\n");
         printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
         return 1;
     }

-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -291,6 +291,9 @@ int main(int argc, char* argv[])
             LogRangeAsType<float>(std::cout << "wei_host : ", wei_k_c_y_x_host_result.mData, ",")
                 << std::endl;
         }

-        ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData);
+        return ck::utils::check_err(wei_k_c_y_x_device_result.mData,
+                                    wei_k_c_y_x_host_result.mData)
+                   ? 0
+                   : 1;
     }

     return 0;
 }
```
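The bwd-weight example threads a `split_k` factor through unchanged. Split-K partitions the GEMM reduction dimension into `split_k` chunks whose partial products are computed independently and then combined. A scalar sketch of the idea (illustrative only, not the CK kernel):

```cpp
// Split-K on a scalar dot product: partition the K loop into split_k chunks,
// accumulate each chunk independently, then combine. Same result as a single
// K loop, but with extra parallelism over the reduction dimension.
#include <cstdio>
#include <vector>

int main()
{
    const int K = 12, split_k = 4, chunk = K / split_k;
    std::vector<float> a(K, 1.f), b(K, 2.f);

    std::vector<float> partial(split_k, 0.f);
    for(int s = 0; s < split_k; ++s) // each chunk could run on its own workgroup
        for(int k = s * chunk; k < (s + 1) * chunk; ++k)
            partial[s] += a[k] * b[k];

    float out = 0.f;
    for(float p : partial)
        out += p; // final combine (atomic add or a second kernel on a GPU)

    std::printf("%.1f\n", out); // 24.0, identical to the single-loop result
}
```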
example/12_reduce/CMakeLists.txt

```cmake
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
```
example/12_reduce/README.md

````diff
@@ -5,23 +5,38 @@
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> : verification (0=no, 1=yes)
 #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
-#arg2: run kernel # of times (>1)
-./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
+#arg2: time kernel (0=no, 1=yes)
+./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
 ```
 Result
 ```
 ./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 3 times...
-Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
-error: 0
-max_diff: 0, 529, 529
-root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
-launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
 ```
+
+# Instructions for ```example_reduce_blockwise_two_call```
+
+## Run ```example_reduce_blockwise_two_call```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_reduce_blockwise_two_call 1 2 1
+```
+Result
+```
+./bin/example_reduce_blockwise_two_call 1 2 1
+launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
-Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
-error: 0
-max_diff: 0, 528, 528
+Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
+```
````
example/12_reduce/reduce_blockwise.cpp

```diff
@@ -12,8 +12,8 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "device_base.hpp"
-#include "device_reduce_blockwise.hpp"
-#include "host_reduce_util.hpp"
+#include "device_reduce_multiblock.hpp"
+#include "host_common_util.hpp"
 #include "host_reduction.hpp"
 #include "reduction_enums.hpp"
@@ -30,9 +30,8 @@ constexpr int Rank = 4;
 constexpr int NumReduceDim = 3;

 constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
-constexpr NanPropagation NanOpt     = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
+constexpr bool PropagateNan = true;
+constexpr bool OutputIndex  = false;

 using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
 using InElementwiseOperation =
@@ -40,7 +39,7 @@ using InElementwiseOperation =
 using AccElementwiseOperation =
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

-using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
+using DeviceReduceInstance = DeviceReduceMultiBlock<InDataType,
                                                    AccDataType,
                                                    OutDataType,
                                                    Rank,
@@ -48,8 +47,10 @@ using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
                                                    ReduceOperation,
                                                    InElementwiseOperation,
                                                    AccElementwiseOperation,
+                                                   InMemoryDataOperationEnum::Set,
                                                    PropagateNan,
-                                                   false,
+                                                   OutputIndex,
+                                                   false, // HaveIndexInputIfOutputIndex
                                                    256,
                                                    4,
                                                    64,
@@ -60,66 +61,22 @@ using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
                                                    1>;

 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
                                        {"scales", required_argument, nullptr, 'S'},
                                        {"verify", required_argument, nullptr, 'v'},
                                        {"help", no_argument, nullptr, '?'},
                                        {nullptr, 0, nullptr, 0}};

 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
     private:
     int option_index = 0;

     public:
-    std::vector<size_t> inLengths;
-    std::vector<float> scales;
-    bool do_verification = false;
+    std::vector<size_t> inLengths = {16, 64, 32, 960};
+    std::vector<float> scales     = {1.0f, 0.0f};
+    bool do_verification          = true;
     int init_method = 1;
-    int nrepeat     = 5;
+    bool time_kernel = true;

     public:
     void show_usage(const char* cmd)
@@ -127,24 +84,24 @@ class SimpleAppArgs
         std::cout << "Usage of " << cmd << std::endl;
         std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
                   << std::endl;
         std::cout << "--scales or -S, comma separated two float values for alpha and beta"
                   << std::endl;
         std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                      "comparing with the host-based reduction"
                   << std::endl;
         std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
                      "value, 3=decimal value)"
                   << std::endl;
-        std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
+        std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
     };

     int processArgs(int argc, char* argv[])
     {
+        using ck::host_common::getTypeValuesFromString;
+
         int ch;

         while(1)
         {
-            ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
             if(ch == -1)
                 break;

             switch(ch)
@@ -155,12 +112,6 @@ class SimpleAppArgs
                 inLengths = getTypeValuesFromString<size_t>(optarg);
                 break;
-            case 'S':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-
-                scales = getTypeValuesFromString<float>(optarg);
-                break;
             case 'v':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");
@@ -182,7 +133,7 @@ class SimpleAppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");

         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

         if(scales.empty())
         {
@@ -196,23 +147,21 @@ class SimpleAppArgs
 int main(int argc, char* argv[])
 {
-    using namespace ck::host_reduce;
-
     const std::vector<int> reduceDims{0, 1, 2};
     const std::vector<int> invariantDims{3};

     SimpleAppArgs args;

+    if(argc > 1)
+    {
         if(args.processArgs(argc, argv) < 0)
             return (-1);
+    };

     constexpr bool op_support_indices =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);

-    constexpr bool NeedIndices =
-        (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
-
     // if input is half type, no reason to use float for indiced reduction operation and must use
     // float for non-indiced reduction operation for accuracy
     constexpr bool invalid_reduce_1 =
@@ -226,8 +175,7 @@ int main(int argc, char* argv[])
         (op_support_indices && !std::is_same<AccDataType, float>::value);

     // indices option can only be used when it is really needed
-    constexpr bool invalid_reduce_3 =
-        (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex);

     constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
@@ -295,39 +243,42 @@ int main(int argc, char* argv[])
     if(beta != 0.0f)
         out_dev.ToDevice(out.mData.data());

-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
+    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;

-    DeviceMem out_indices_dev(indicesSizeInBytes);
+    DeviceMem out_index_dev(indicesSizeInBytes);

     if(args.do_verification)
     {
         ReductionHost<InDataType,
                       AccDataType,
                       OutDataType,
-                      ReduceOpId,
+                      ReduceOperation,
                       InElementwiseOperation,
                       AccElementwiseOperation,
                       Rank,
                       NumReduceDim,
                       PropagateNan,
-                      NeedIndices>
+                      OutputIndex>
             hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

         hostReduce.Run(
             alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
     };

-    const auto i_inLengths  = to_int_vector(args.inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
+    std::vector<ck::index_t> i_inLengths;
+    std::vector<ck::index_t> i_inStrides;
+    std::vector<ck::index_t> i_outLengths;
+    std::vector<ck::index_t> i_outStrides;

-    auto reduce = DeviceReduceInstance{};
+    i_inLengths.assign(args.inLengths.begin(), args.inLengths.end());
+    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    i_outStrides.assign(outStrides.begin(), outStrides.end());

-    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-    DeviceMem ws_dev(wsSizeInBytes);
+    auto reduce = DeviceReduceInstance{};

     auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
                                                    i_inStrides,
                                                    i_outLengths,
                                                    i_outStrides,
@@ -335,11 +286,11 @@ int main(int argc, char* argv[])
                                                    alpha,
                                                    beta,
                                                    in_dev.GetDeviceBuffer(),
                                                    nullptr,
                                                    out_dev.GetDeviceBuffer(),
-                                                   out_indices_dev.GetDeviceBuffer(),
-                                                   ws_dev.GetDeviceBuffer(),
-                                                   InElementwiseOperation{static_cast<int>(reduce_total_length)},
-                                                   AccElementwiseOperation{static_cast<int>(reduce_total_length)});
+                                                   out_index_dev.GetDeviceBuffer(),
+                                                   InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
+                                                   AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});

     if(!reduce.IsSupportedArgument(argument_ptr.get()))
     {
@@ -352,7 +303,7 @@ int main(int argc, char* argv[])
     auto invoker_ptr = reduce.MakeInvokerPointer();

-    float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});

     std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
                             invariant_total_length * sizeof(OutDataType);
@@ -362,16 +313,19 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
               << std::endl;

+    bool pass = true;
+
     if(args.do_verification)
     {
         out_dev.FromDevice(out.mData.data());
-        ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);

-        if(NeedIndices)
+        if(OutputIndex)
         {
-            out_indices_dev.FromDevice(out_indices.mData.data());
-            ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
-            ;
+            out_index_dev.FromDevice(out_indices.mData.data());
+            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
         };
     };

+    return (pass ? 0 : 1);
 }
```
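For `ReduceTensorOp::NORM2`, the input-side elementwise operation squares each element and the accumulator-side operation takes the square root, i.e. an L2 norm over the reduced dimensions. A host-side sketch of those semantics in plain C++ (illustrative, not the CK reference implementation):

```cpp
// Host reference for a NORM2 (L2-norm) reduction: square on the way in,
// add, sqrt on the way out — the same split the In/AccElementwiseOperation
// pair expresses in the example above.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<float> in{3.f, 4.f}; // reduce the whole vector
    double acc = 0.0;
    for(float x : in)
        acc += static_cast<double>(x) * x; // InElementwiseOperation: square
    const double norm2 = std::sqrt(acc);   // AccElementwiseOperation: sqrt
    std::printf("norm2 = %.1f\n", norm2);  // prints 5.0
}
```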
example/12_reduce/reduce_blockwise_two_call.cpp (new file, mode 100644)

```cpp
#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_multiblock.hpp"
#include "host_common_util.hpp"
#include "host_reduction.hpp"
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"

using namespace ck;
using namespace ck::tensor_operation::device;

using InOutDataType = ck::half_t;
using AccDataType   = float;

constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan         = true;
constexpr bool OutputIndex          = false;

using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType>;

using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
                                                      AccDataType,
                                                      InOutDataType,
                                                      5, // Rank
                                                      1, // NumReduceDim
                                                      ReduceOperation,
                                                      InElementwiseOperation,
                                                      PassThroughOp,
                                                      InMemoryDataOperationEnum::Set,
                                                      PropagateNan,
                                                      OutputIndex,
                                                      false, // HaveIndexInputIfOutputIndex
                                                      256,
                                                      32,
                                                      8,
                                                      1,
                                                      1,
                                                      1, // vector dim
                                                      1,
                                                      1>;

using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
                                                      AccDataType,
                                                      InOutDataType,
                                                      4, // Rank
                                                      1, // NumReduceDim
                                                      ReduceOperation,
                                                      PassThroughOp,
                                                      AccElementwiseOperation,
                                                      InMemoryDataOperationEnum::Set,
                                                      PropagateNan,
                                                      OutputIndex,
                                                      false, // HaveIndexInputIfOutputIndex
                                                      256,
                                                      128,
                                                      2,
                                                      1,
                                                      1,
                                                      1, // vector dim
                                                      1,
                                                      1>;

static bool do_verify;
static int init_method;
static float alpha;
static float beta;
static bool time_kernel;

int main(int argc, char* argv[])
{
    // used by the device reduction
    const std::vector<int> reduceDims_1    = {4};
    const std::vector<int> invariantDims_1 = {0, 1, 2, 3};
    const std::vector<int> reduceDims_2    = {3};
    const std::vector<int> invariantDims_2 = {0, 1, 2};

    // used by the host reduction
    const std::vector<int> reduceDims    = {3, 4};
    const std::vector<int> invariantDims = {0, 1, 2};

    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

    // input lengths of the second reduction, which is also the output lengths of the first
    // reduction
    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};
    const std::vector<size_t> outLengths  = {64, 320, 80};

    if(argc == 1)
    {
        do_verify   = true;
        init_method = 2;
        time_kernel = true;
    }
    else if(argc == 4)
    {
        do_verify   = static_cast<bool>(std::atoi(argv[1]));
        init_method = atoi(argv[2]);
        time_kernel = static_cast<bool>(atoi(argv[3]));
    }
    else
    {
        std::ostringstream ostr;

        ostr << "Wrong parameter! " << std::endl
             << "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;

        throw std::runtime_error(ostr.str());
    };

    alpha = 1.0f;
    beta  = 0.0f;

    Tensor<InOutDataType> in_1(inLengths_1);
    Tensor<InOutDataType> out_ref(outLengths);

    Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
    Tensor<InOutDataType> out(outLengths);

    auto inStrides_1 = in_1.mDesc.GetStrides();
    auto inStrides_2 = in_2.mDesc.GetStrides();
    auto outStrides  = out.mDesc.GetStrides();

    size_t invariant_total_length = out.mDesc.GetElementSize();
    size_t reduce_total_length    = in_1.mDesc.GetElementSize() / invariant_total_length;

    std::size_t num_thread = 1;

    if(do_verify)
    {
        switch(init_method)
        {
        case 0: break;
        case 1:
            in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            break;
        case 2:
            in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            break;
        default:
            in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
        }

        if(beta != 0.0f)
            for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
                out.mData[i] = out_ref.mData[i];
    };

    DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpace());
    DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpace());
    DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpace());

    in_1_dev.ToDevice(in_1.mData.data());

    if(beta != 0.0f)
        out_dev.ToDevice(out.mData.data());

    if(do_verify)
    {
        ReductionHost<InOutDataType,
                      AccDataType,
                      InOutDataType,
                      ReduceOperation,
                      InElementwiseOperation,
                      AccElementwiseOperation,
                      5, // Rank
                      2, // NumReduceDim
                      PropagateNan,
                      OutputIndex>
            hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);

        hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr);
    };

    std::vector<ck::index_t> i_inLengths_1;
    std::vector<ck::index_t> i_inStrides_1;
    std::vector<ck::index_t> i_inLengths_2;
    std::vector<ck::index_t> i_inStrides_2;
    std::vector<ck::index_t> i_outLengths;
    std::vector<ck::index_t> i_outStrides;

    i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
    i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
    i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
    i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
    i_outLengths.assign(outLengths.begin(), outLengths.end());
    i_outStrides.assign(outStrides.begin(), outStrides.end());

    auto reduce_1 = DeviceReduceInstance_1{};

    auto argument_ptr_1 = reduce_1.MakeArgumentPointer(
        i_inLengths_1,
        i_inStrides_1,
        i_inLengths_2,
        i_inStrides_2,
        reduceDims_1,
        1.0f,
        0.0f,
        in_1_dev.GetDeviceBuffer(),
        nullptr,
        in_2_dev.GetDeviceBuffer(),
        nullptr,
        InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
        PassThroughOp{});

    if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
    {
        std::cout << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
                  << std::endl;
    };

    auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();

    auto reduce_2 = DeviceReduceInstance_2{};

    auto argument_ptr_2 = reduce_2.MakeArgumentPointer(
        i_inLengths_2,
        i_inStrides_2,
        i_outLengths,
        i_outStrides,
        reduceDims_2,
        alpha,
        beta,
        in_2_dev.GetDeviceBuffer(),
        nullptr,
        out_dev.GetDeviceBuffer(),
        nullptr,
        PassThroughOp{},
        AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});

    if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
    {
        std::cout << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
                  << std::endl;
    };

    auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();

    float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
    float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});

    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
                            invariant_total_length * sizeof(InOutDataType);

    float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);

    std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
              << reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;

    bool pass = true;

    if(do_verify)
    {
        out_dev.FromDevice(out.mData.data());
        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
    };

    return (pass ? 0 : 1);
}
```
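Splitting the NORM2 reduction over dims {3, 4} into two single-dim calls works because the square is applied only in the first call and the square root only in the second (`PassThroughOp` fills the other slot in each instance), so the intermediate tensor holds raw partial sums of squares. A tiny host-side check of that factorization, in plain C++ (illustrative only):

```cpp
// sqrt(sum over y,x of v[y][x]^2) == sqrt(sum over y of (sum over x of v[y][x]^2))
// Call 1: square + sum over x (no sqrt). Call 2: plain sum over y, then sqrt.
#include <cmath>
#include <cstdio>

int main()
{
    const float v[2][2] = {{1.f, 2.f}, {3.f, 4.f}};

    // one-shot reference over both dims
    double ref = 0.0;
    for(auto& row : v)
        for(float x : row)
            ref += double(x) * x;
    ref = std::sqrt(ref);

    // two-call version: the intermediate keeps raw sums of squares
    double partial[2] = {};
    for(int y = 0; y < 2; ++y)
        for(int x = 0; x < 2; ++x)
            partial[y] += double(v[y][x]) * v[y][x]; // call 1: square + Set
    double two = 0.0;
    for(double p : partial)
        two += p;         // call 2: plain sum...
    two = std::sqrt(two); // ...with the sqrt applied only at the very end

    std::printf("%.6f %.6f\n", ref, two); // identical values
}
```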
example/13_pool2d_fwd/CMakeLists.txt

```diff
-add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
+add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
+add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)
```
example/13_pool2d_fwd/README.md

````diff
-# Instructions for ```example_pool2d_fwd``` Example
+# Instructions for ```example_pool2d_fwd``` Examples

-## Run ```example_pool2d_fwd```
+## Run ```example_pool2d_fwd_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
-#arg3: run kernel # of times (>1)
+#arg3: time kernel (0=no, 1=yes)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_pool2d_fwd 1 1 10
+./bin/example_pool2d_fwd_fp16 1 1 1
 ```
 Result
@@ -14,9 +14,28 @@ Result
 in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
 launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
-Warm up
+Warm up 1 time
 Start running 10 times...
-Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
-error: 0
-max_diff: 0, 1, 1
+Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
 ```
+
+## Run ```example_pool2d_fwd_fp32```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
+./bin/example_pool2d_fwd_fp32 1 1 1
+```
+Result
+```
+./bin/example_pool2d_fwd_fp32 1 1 1
+in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
+out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
+launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
+```
````
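The README's 71 → 36 output size follows from the usual pooling formula `Ho = (Hi + LeftPy + RightPy − Y) / Sy + 1`, the same expression `pool2d_fwd_common.hpp` computes below. A quick compile-time check with the default shape (standalone sketch):

```cpp
// Output-size check for the README's default pool2d shape.
constexpr int pooled_len(int in, int pad_l, int pad_r, int window, int stride)
{
    return (in + pad_l + pad_r - window) / stride + 1;
}

// Hi = 71, pads = 1/1, window Y = 3, stride Sy = 2 -> Ho = 36, as printed above.
static_assert(pooled_len(71, 1, 1, 3, 2) == 36, "Hi=71 -> Ho=36");

int main() { return 0; }
```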
example/13_pool2d_fwd/pool2d_fwd
.c
pp
→
example/13_pool2d_fwd/pool2d_fwd
_common.h
pp
View file @
68886f7d
#pragma once
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "config.hpp"
...
...
@@ -10,89 +8,67 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_reduce_util.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "reduction_operator.hpp"
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NHWC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NHWC
;
#if 1
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
#else
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
#endif
static
constexpr
bool
NeedIndices
=
false
;
static
constexpr
bool
PropagateNan
=
false
;
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "reduction_functions_accumulate.hpp"
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
AccDataType
,
// AccDataType
ReduceOpId
,
NeedIndices
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
4
>
;
// InSrcOutDstVectorSize
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
typename
AccDataType
,
typename
IndexDataType
,
ck
::
ReduceTensorOp
ReduceOpId
,
bool
PropagateNan
,
bool
NeedIndices
>
bool
OutputIndex
>
static
void
pool_host_verify
(
const
Tensor
<
InDataType
>&
in
,
Tensor
<
OutDataType
>&
out
,
Tensor
<
int
>&
out_indices
,
Tensor
<
IndexDataType
>&
out_indices
,
const
std
::
array
<
ck
::
index_t
,
2
>&
window_spatial_lengths
,
const
std
::
array
<
ck
::
index_t
,
2
>&
window_strides
,
const
std
::
array
<
ck
::
index_t
,
2
>&
in_left_pads
,
const
std
::
array
<
ck
::
index_t
,
2
>&
/*in_right_pads*/
)
{
using
namespace
ck
::
host_reduce
;
const
int32_t
divider
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
]
;
const
int
divider
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
using
ReduceOperation
=
typename
ck
::
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
ck
::
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
ck
::
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
const
auto
PreUnaryOp
=
PreUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
const
auto
PosUnaryOp
=
PosUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
const
InElementwiseOperation
in_elementwise_op
(
divider
);
const
AccElementwiseOperation
acc_elementwise_op
(
divider
);
if
constexpr
(
!
NeedIndices
)
if
constexpr
(
!
OutputIndex
)
{
auto
opReduce
=
ReduceOpFn
<
AccDataType
,
ReduceOpId
>
();
using
Accumulation
=
ck
::
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
>
;
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOp
Z
er
oVal
<
AccDataType
,
ReduceOpId
>
();
auto
accuVal
=
ReduceOper
ation
::
GetIdentityValue
();
for
(
in
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
for
(
ck
::
index_
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
{
in
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
for
(
in
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
ck
::
index_
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
for
(
ck
::
index_
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
in
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
ck
::
type_convert
<
in
t
>
(
in
.
mDesc
.
GetLengths
()[
2
])
&&
wi
>=
0
&&
wi
<
ck
::
type_convert
<
in
t
>
(
in
.
mDesc
.
GetLengths
()[
3
]))
ck
::
index_
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
static_cast
<
ck
::
index_
t
>
(
in
.
mDesc
.
GetLengths
()[
2
])
&&
wi
>=
0
&&
wi
<
static_cast
<
ck
::
index_
t
>
(
in
.
mDesc
.
GetLengths
()[
3
]))
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
PreUnaryOp
(
currVal
);
in_elementwise_op
(
currVal
,
currVal
);
binop_with_nan_check
<
AccDataType
,
PropagateNan
>
(
opReduce
,
accuVal
,
currVal
);
Accumulation
::
Calculate
(
accuVal
,
currVal
);
}
}
}
PosUnaryOp
(
accuVal
);
acc_elementwise_op
(
accuVal
,
accuVal
);
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
};
...
...
@@ -105,33 +81,34 @@ static void pool_host_verify(const Tensor<InDataType>& in,
}
else
{
auto
opReduce
=
ReduceOpFn2
<
AccDataType
,
ReduceOpId
>
();
using
Accumulation
=
ck
::
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
,
IndexDataType
>
;
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOp
Z
er
oVal
<
AccDataType
,
ReduceOpId
>
();
int
accuIndex
=
0
;
auto
accuVal
=
ReduceOper
ation
::
GetIdentityValue
();
IndexDataType
accuIndex
=
0
;
for
(
in
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
for
(
ck
::
index_
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
{
in
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
for
(
in
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
ck
::
index_
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
for
(
ck
::
index_
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
in
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
ck
::
index_
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
int
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
IndexDataType
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
PreUnaryOp
(
currVal
);
in_elementwise_op
(
currVal
,
currVal
);
binop_with_nan_check2
<
AccDataType
,
PropagateNan
>
(
opReduce
,
accuVal
,
currVal
,
accuIndex
,
currIndex
);
Accumulation
::
Calculate
(
accuVal
,
currVal
,
accuIndex
,
currIndex
);
}
}
}
PosUnaryOp
(
accuVal
);
acc_elementwise_op
(
accuVal
,
accuVal
);
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
out_indices
(
n
,
c
,
ho
,
wo
)
=
accuIndex
;
...

@@ -145,62 +122,44 @@ static void pool_host_verify(const Tensor<InDataType>& in,
    };
}

-int main(int argc, char* argv[])
+template <typename InDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename IndexDataType,
+          typename InLayout,
+          typename OutLayout,
+          ck::ReduceTensorOp ReduceOpId,
+          bool PropagateNan,
+          bool OutputIndex>
+bool pool_test(bool do_verification,
+               int init_method,
+               bool time_kernel,
+               ck::index_t N,
+               ck::index_t C,
+               ck::index_t Y,
+               ck::index_t X,
+               ck::index_t Hi,
+               ck::index_t Wi,
+               ck::index_t window_stride_h,
+               ck::index_t window_stride_w,
+               ck::index_t in_left_pad_h,
+               ck::index_t in_left_pad_w,
+               ck::index_t in_right_pad_h,
+               ck::index_t in_right_pad_w)
{
-    using namespace ck::host_reduce;
-
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
-
-    // Pool shape
-    ck::index_t N               = 128;
-    ck::index_t C               = 192;
-    ck::index_t Y               = 3;
-    ck::index_t X               = 3;
-    ck::index_t Hi              = 71;
-    ck::index_t Wi              = 71;
-    ck::index_t window_stride_h = 2;
-    ck::index_t window_stride_w = 2;
-    ck::index_t in_left_pad_h   = 1;
-    ck::index_t in_left_pad_w   = 1;
-    ck::index_t in_right_pad_h  = 1;
-    ck::index_t in_right_pad_w  = 1;
-
-    if(argc == 4)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-    }
-    else if(argc == 16)
-    {
-        do_verification = std::stoi(argv[1]);
-        init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
-
-        N               = std::stoi(argv[4]);
-        C               = std::stoi(argv[5]);
-        Y               = std::stoi(argv[6]);
-        X               = std::stoi(argv[7]);
-        Hi              = std::stoi(argv[8]);
-        Wi              = std::stoi(argv[9]);
-        window_stride_h = std::stoi(argv[10]);
-        window_stride_w = std::stoi(argv[11]);
-        in_left_pad_h   = std::stoi(argv[12]);
-        in_left_pad_w   = std::stoi(argv[13]);
-        in_right_pad_h  = std::stoi(argv[14]);
-        in_right_pad_w  = std::stoi(argv[15]);
-    }
-    else
-    {
-        printf("arg1: verification (0=no, 1=yes)\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
-        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx\n");
-        exit(0);
-    }
-
    using DevicePoolFwdInstance =
        ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
            InDataType,  // InDataType
            OutDataType, // OutDataType
            AccDataType, // AccDataType
            ReduceOpId,
            OutputIndex,
            64, // BlockSize
            64, // ReduceMThreadClusterSize
            1,  // ReduceKThreadClusterSize
            4,  // ReduceMThreadSliceSize
            1,  // ReduceKThreadSliceSize
            4>; // InSrcOutDstVectorSize

    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
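With the default shape the examples below use (Hi = Wi = 71, 3x3 window, stride 2, padding 1 on each side), the output-size formula above works out to 36. A quick standalone check of the arithmetic (not CK code):

#include <iostream>

int main()
{
    // Same formula as pool_test above: Ho = (Hi + pad_l + pad_r - Y) / stride + 1
    const int Hi = 71, Y = 3, pad_l = 1, pad_r = 1, stride = 2;
    const int Ho = (Hi + pad_l + pad_r - Y) / stride + 1; // (71 + 1 + 1 - 3) / 2 + 1 = 36
    std::cout << "Ho = " << Ho << std::endl; // prints 36; Wo is identical by symmetry
    return 0;
}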
...

@@ -228,9 +187,11 @@ int main(int argc, char* argv[])
    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
-   Tensor<int> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+   Tensor<IndexDataType> out_indices_n_c_ho_wo_host(
+       f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
-   Tensor<int> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+   Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
+       f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));

    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;

...

@@ -245,17 +206,17 @@ int main(int argc, char* argv[])
    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace());
-   DeviceMem out_indices_device_buf(sizeof(int) *
+   DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
                                     out_indices_n_c_ho_wo_device.mDesc.GetElementSpace());

    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());

    auto pool        = DevicePoolFwdInstance{};
    auto invoker_ptr = pool.MakeInvokerPointer();
    auto argument_ptr =
        pool.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                 static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                static_cast<int*>(out_indices_device_buf.GetDeviceBuffer()),
+                                static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
                                 N,
                                 C,
                                 std::array<ck::index_t, 2>{{Hi, Wi}},

...

@@ -271,7 +232,7 @@ int main(int argc, char* argv[])
            "not support this problem");
    }

-   float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+   float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;
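Throughout this commit the old `nrepeat` argument to `Run` is replaced by CK's `StreamConfig`: the second field selects whether `Run` merely launches the kernel or also times it and returns an average latency in milliseconds. A standalone sketch of that calling pattern (plain C++ with a host-side stand-in; this is not CK's implementation, just the shape of the interface):

#include <chrono>
#include <iostream>

// Illustrative stand-in for the StreamConfig{stream, time_kernel} pattern used
// above: the same Run() entry point either just launches the work or times it.
struct StreamConfig
{
    void* stream_id_  = nullptr; // CK passes a hipStream_t here; nullptr = default stream
    bool time_kernel_ = false;   // true: measure and return average time in ms
};

template <typename F>
float Run(F&& kernel, const StreamConfig& cfg, int nrepeat = 10)
{
    if(!cfg.time_kernel_)
    {
        kernel();
        return 0.0f; // launch only, no timing
    }

    auto t0 = std::chrono::steady_clock::now();
    for(int i = 0; i < nrepeat; ++i)
        kernel();
    auto t1 = std::chrono::steady_clock::now();

    return std::chrono::duration<float, std::milli>(t1 - t0).count() / nrepeat;
}

int main()
{
    volatile double sink = 0;
    auto work = [&] { for(int i = 0; i < 1000000; ++i) sink = sink + 1.0; };

    std::cout << "untimed: " << Run(work, StreamConfig{nullptr, false}) << " ms\n";
    std::cout << "timed:   " << Run(work, StreamConfig{nullptr, true}) << " ms\n";
    return 0;
}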
...

@@ -285,14 +246,17 @@ int main(int argc, char* argv[])
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

+   bool pass = true;
+
    if(do_verification)
    {
        pool_host_verify<InDataType,
                         OutDataType,
                         AccDataType,
                         IndexDataType,
                         ReduceOpId,
                         PropagateNan,
-                        NeedIndices>(in_n_c_hi_wi,
+                        OutputIndex>(in_n_c_hi_wi,
                                      out_n_c_ho_wo_host,
                                      out_indices_n_c_ho_wo_host,
                                      window_spatial_lengths,

...

@@ -302,14 +266,16 @@ int main(int argc, char* argv[])
        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());

-       ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
+       pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);

-       if constexpr(NeedIndices)
+       if constexpr(OutputIndex)
        {
            out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());

-           // ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
-           //                      out_indices_n_c_ho_wo_host.mData);
-           ;
+           pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
+                                               out_indices_n_c_ho_wo_host.mData);
        };
    }

+   return (pass);
}
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp 0 → 100644
#include <iostream>
#include <cstdlib>

#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"

#include "pool2d_fwd_common.hpp"

using InDataType  = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

using IndexDataType = int32_t;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex  = false;
static constexpr bool PropagateNan = false;

int main(int argc, char* argv[])
{
    bool do_verification;
    int init_method;
    bool time_kernel;

    // Pool shape
    ck::index_t N               = 128;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        do_verification = true;
        init_method     = 1;
        time_kernel     = true;
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
    }
    else if(argc == 16)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));

        N               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        window_stride_h = std::stoi(argv[10]);
        window_stride_w = std::stoi(argv[11]);
        in_left_pad_h   = std::stoi(argv[12]);
        in_left_pad_w   = std::stoi(argv[13]);
        in_right_pad_h  = std::stoi(argv[14]);
        in_right_pad_w  = std::stoi(argv[15]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx\n");
        exit(0);
    }

    bool pass = pool_test<InDataType,
                          OutDataType,
                          AccDataType,
                          IndexDataType,
                          InLayout,
                          OutLayout,
                          ReduceOpId,
                          PropagateNan,
                          OutputIndex>(do_verification,
                                       init_method,
                                       time_kernel,
                                       N,
                                       C,
                                       Y,
                                       X,
                                       Hi,
                                       Wi,
                                       window_stride_h,
                                       window_stride_w,
                                       in_left_pad_h,
                                       in_left_pad_w,
                                       in_right_pad_h,
                                       in_right_pad_w);

    return (pass ? 0 : 1);
}
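Assuming the example target is built by the CMake rules earlier in this commit, the binary accepts the no-argument, 3-argument, or 15-argument forms parsed above; for instance (binary path illustrative):

    ./bin/example_pool2d_fwd_fp16                                      # defaults: verify, integer init, time kernel
    ./bin/example_pool2d_fwd_fp16 1 2 1                                # verify, decimal init, time kernel
    ./bin/example_pool2d_fwd_fp16 1 1 1 128 192 3 3 71 71 2 2 1 1 1 1  # explicit shape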
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp 0 → 100644
#include <iostream>
#include <cstdlib>

#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"

#include "pool2d_fwd_common.hpp"

using InDataType  = float;
using OutDataType = float;
using AccDataType = float;

using IndexDataType = int32_t;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex  = false;
static constexpr bool PropagateNan = false;

int main(int argc, char* argv[])
{
    bool do_verification;
    int init_method;
    bool time_kernel;

    // Pool shape
    ck::index_t N               = 128;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        do_verification = true;
        init_method     = 1;
        time_kernel     = true;
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
    }
    else if(argc == 16)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));

        N               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        window_stride_h = std::stoi(argv[10]);
        window_stride_w = std::stoi(argv[11]);
        in_left_pad_h   = std::stoi(argv[12]);
        in_left_pad_w   = std::stoi(argv[13]);
        in_right_pad_h  = std::stoi(argv[14]);
        in_right_pad_w  = std::stoi(argv[15]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx\n");
        exit(0);
    }

    bool pass = pool_test<InDataType,
                          OutDataType,
                          AccDataType,
                          IndexDataType,
                          InLayout,
                          OutLayout,
                          ReduceOpId,
                          PropagateNan,
                          OutputIndex>(do_verification,
                                       init_method,
                                       time_kernel,
                                       N,
                                       C,
                                       Y,
                                       X,
                                       Hi,
                                       Wi,
                                       window_stride_h,
                                       window_stride_w,
                                       in_left_pad_h,
                                       in_left_pad_w,
                                       in_right_pad_h,
                                       in_right_pad_w);

    return (pass ? 0 : 1);
}
example/14_gemm_xdl_requant_relu_requant/gemm_xdl_requant_relu_requant_int8.cpp
...

@@ -100,14 +100,19 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
            16>;               // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on

-using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, RequantReluRequant>;
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        float,
+                                                                        PassThrough,
+                                                                        PassThrough,
+                                                                        RequantReluRequant>;

int main(int argc, char* argv[])
{
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;

    // GEMM shape
    ck::index_t M = 3840;

...

@@ -125,13 +130,13 @@ int main(int argc, char* argv[])
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
-       nrepeat         = std::stoi(argv[3]);
+       time_kernel     = std::stoi(argv[3]);
    }
    else if(argc == 10)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
-       nrepeat         = std::stoi(argv[3]);
+       time_kernel     = std::stoi(argv[3]);

        M = std::stoi(argv[4]);
        N = std::stoi(argv[5]);

...

@@ -145,7 +150,7 @@ int main(int argc, char* argv[])
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-       printf("arg3: run kernel # of times (>1)\n");
+       printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
        exit(0);
    }

...

@@ -219,7 +224,7 @@ int main(int argc, char* argv[])
            "not support this GEMM problem");
    }

-   float ave_time = invoker.Run(argument, nrepeat);
+   float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =

...

@@ -244,7 +249,7 @@ int main(int argc, char* argv[])
        ref_invoker.Run(ref_argument);

-       ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+       return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
    }

    return 0;

...
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
...

@@ -56,29 +56,29 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
// clang-format on

using ReferenceGemmInstance = ck::tensor_operation::host::
-   ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+   ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

int main(int argc, char* argv[])
{
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;

    if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
-       nrepeat         = std::stoi(argv[3]);
+       time_kernel     = std::stoi(argv[3]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-       printf("arg3: run kernel # of times (>1)\n");
+       printf("arg3: time kernel (0=no, 1=yes)\n");
        exit(0);
    }

-   int group_count = 4;
+   int group_count = rand() % 16 + 1;

    // GEMM shape
    std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes;
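Each of the randomly chosen groups then gets its own independent problem size. A hedged sketch of how the vector above might be populated (the aggregate-initialization order M, N, K, StrideA, StrideB, StrideC is an assumption about ck::tensor_operation::device::GemmShape, not confirmed by this diff; the sizes are arbitrary examples):

    // One independent GEMM problem per group; sizes here are illustrative only.
    for(int i = 0; i < group_count; ++i)
    {
        ck::index_t M = 256 * (i + 1);
        ck::index_t N = 128 * (i + 1);
        ck::index_t K = 64 * (i + 1);

        // Assumed field order {M, N, K, StrideA, StrideB, StrideC}, matching
        // row-major A/C and column-major B as used in this example.
        gemm_shapes.push_back({M, N, K, K, K, N});
    }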
...

@@ -189,12 +189,17 @@ int main(int argc, char* argv[])
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};

-   // do GEMM
    auto gemm    = DeviceGemmInstance{};
    auto invoker = gemm.MakeInvoker();

+   // do GEMM
    auto argument = gemm.MakeArgument(
        p_a, p_b, p_c, gemm_shapes, a_element_op, b_element_op, c_element_op);

+   DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument));
+
+   gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer());
+
    if(!gemm.IsSupportedArgument(argument))
    {
        throw std::runtime_error(

...

@@ -202,7 +207,7 @@ int main(int argc, char* argv[])
            "not support this GEMM problem");
    }

-   float ave_time = invoker.Run(argument, nrepeat);
+   float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

...

@@ -211,6 +216,7 @@ int main(int argc, char* argv[])
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << gemm.GetTypeString() << std::endl;

+   bool pass = true;
    if(do_verification)
    {
        for(std::size_t i = 0; i < gemm_shapes.size(); i++)

...

@@ -227,9 +233,9 @@ int main(int argc, char* argv[])
                c_element_op);

            ref_invoker.Run(ref_argument);
-           ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
+           pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
        }
    }

-   return 0;
+   return pass ? 0 : 1;
}
example/16_gemm_reduce/CMakeLists.txt
-add_example_executable(example_gemm_reduce_xdl_fp16 gemm_reduce_xdl_fp16.cpp)
+add_example_executable(example_gemm_reduce_xdl_max_fp16 gemm_reduce_xdl_max_fp16.cpp)
+add_example_executable(example_gemm_reduce_xdl_mean_squaremean_fp16 gemm_reduce_xdl_mean_squaremean_fp16.cpp)
example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp → example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
...

@@ -3,7 +3,8 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>

+#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"

...

@@ -11,16 +12,16 @@
#include "device_tensor.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reduction_operator.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
-#include "reduction_operator.hpp"
+#include "element_wise_reduce_operation.hpp"

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using F16 = ck::half_t;
using F32 = float;
+using F64 = double;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

...

@@ -28,7 +29,10 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using ADataType = F16;
using BDataType = F16;
using CDataType = F16;
-using DDataType = F32;
+using GemmAccDataType   = F32;
+using ReduceAccDataType = F32;
+using DDataType         = F64;
+using DPtrsGlobal       = ck::Tuple<DDataType*>;

using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;

...

@@ -37,30 +41,51 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-using D0ReduceOp  = ck::reduce::Add<float>;
-using D1ReduceOp  = ck::reduce::Add<float>;
-using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
+using DsReduceOp  = ck::Tuple<ck::reduce::Max<ReduceAccDataType>>;
+using DsElementOp = ck::Tuple<ck::tensor_operation::element_wise::
+                                  UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>>;
+using DGlobalMemOp =
+    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;

static constexpr auto GemmSpecialization =
    ck::tensor_operation::device::GemmSpecialization::Default;
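The switch from Add/UnarySquare to a Tuple holding ck::reduce::Max pairs the reduction with AtomicMax as the global-memory data operation: partial maxima written by different workgroups are merged by max rather than accumulated. A standalone sketch of that merge rule (plain C++ with std::atomic standing in for the GPU atomic; this illustrates the idea, not CK's implementation):

#include <algorithm>
#include <atomic>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

// Merge a partial maximum into global memory the way an atomic-max data
// operation would: compare-exchange until our value is no longer larger.
void atomic_max(std::atomic<float>& dst, float val)
{
    float cur = dst.load();
    while(val > cur && !dst.compare_exchange_weak(cur, val)) {}
}

int main()
{
    // Identity for max is the lowest representable value (cf. GetIdentityValue
    // and the SetValue(...Lowest()) initialization later in this file).
    std::atomic<float> d{std::numeric_limits<float>::lowest()};

    std::vector<std::thread> workers;
    for(int w = 0; w < 4; ++w)
        workers.emplace_back([&, w] {
            float partial = std::numeric_limits<float>::lowest();
            for(int i = 0; i < 256; ++i)
                partial = std::max(partial, static_cast<float>(w * 256 + i));
            atomic_max(d, partial); // one global merge per "workgroup"
        });

    for(auto& t : workers)
        t.join();

    std::cout << "global max = " << d.load() << std::endl; // 1023
    return 0;
}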
// clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout| AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A/B/C Elementwise| Dxs Reduce / InEleOp / AccEleOp| D MemoryData| GEMM Specialization| NumGemmK Prefetch Stage| Block/tile sizes and A/B/C block-transfer, CShuffle and CReduce parameters (full column banner elided)
-        <Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, D1ElementOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
+        <Row, Col, Row, F16, F16, F16, F32, F32, ReduceAccDataType, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DsReduceOp, DsElementOp, DsElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on
-using ReferenceGemmInstance = ck::tensor_operation::host::
-    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
+using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
+                                                                        BDataType,
+                                                                        CDataType,
+                                                                        GemmAccDataType,
+                                                                        AElementOp,
+                                                                        BElementOp,
+                                                                        CElementOp>;

+template <typename ADataType, typename BDataType, typename CDataType, typename DDataType>
+void DumpGemmLayerNormPerf(float gemm_reduce_time, int M, int N, int K)
+{
+    std::size_t gemm_flop     = std::size_t(2) * M * N * K;
+    std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                                sizeof(CDataType) * M * N + sizeof(DDataType) * M;
+
+    float tflops          = static_cast<float>(gemm_flop) / 1.E9 / gemm_reduce_time;
+    float gemm_gb_per_sec = gemm_num_byte / 1.E6 / gemm_reduce_time;
+
+    std::cout << "gemm + reduceMax Perf: " << gemm_reduce_time << " ms, " << tflops
+              << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl;
+}
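To make the units concrete: dividing FLOPs by 1.E9 gives GFLOP, and dividing GFLOP by a time in milliseconds yields TFLOP/s. With the default M = N = K = 3840 used below and an assumed runtime of 1.2 ms (illustrative, not a measured number), the dump above would report about 94 TFLOPS:

#include <cstddef>
#include <iostream>

int main()
{
    // Same arithmetic as DumpGemmLayerNormPerf; 1.2 ms is an assumed runtime.
    const std::size_t M = 3840, N = 3840, K = 3840;
    const float time_ms = 1.2f;

    const std::size_t flop = std::size_t(2) * M * N * K; // 113'246'208'000
    std::cout << static_cast<float>(flop) / 1.E9 / time_ms << " TFlops\n"; // ~94.4
    return 0;
}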
int main(int argc, char* argv[])
{
-    bool do_verification = 1;
+    bool do_verification = true;
     int init_method      = 1;
-    int nrepeat          = 5;
+    bool time_kernel     = false;

    // GEMM shape
    ck::index_t M = 3840;

...

@@ -79,13 +104,13 @@ int main(int argc, char* argv[])
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
-       nrepeat         = std::stoi(argv[3]);
+       time_kernel     = std::stoi(argv[3]);
    }
    else if(argc == 10)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
-       nrepeat         = std::stoi(argv[3]);
+       time_kernel     = std::stoi(argv[3]);

        M = std::stoi(argv[4]);
        N = std::stoi(argv[5]);

...

@@ -122,22 +147,17 @@ int main(int argc, char* argv[])
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-   Tensor<DDataType> d0_m_host_result(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-   Tensor<DDataType> d1_m_host_result(
+   Tensor<DDataType> d_m_host_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
    Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-   Tensor<DDataType> d0_m_device_result(
-       HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
-   Tensor<DDataType> d1_m_device_result(
+   Tensor<DDataType> d_m_device_result(
        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-   std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
-   std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
+   std::cout << "d_m: " << d_m_host_result.mDesc << std::endl;

    switch(init_method)
    {

...

@@ -155,8 +175,7 @@ int main(int argc, char* argv[])
    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
-   DeviceMem d0_device_buf(sizeof(DDataType) * d0_m_device_result.mDesc.GetElementSpace());
-   DeviceMem d1_device_buf(sizeof(DDataType) * d1_m_device_result.mDesc.GetElementSpace());
+   DeviceMem d_device_buf(sizeof(DDataType) * d_m_device_result.mDesc.GetElementSpace());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());

...

@@ -164,7 +183,8 @@ int main(int argc, char* argv[])
    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};
-   auto d1_element_op = D1ElementOp{};
+   auto ds_element_op = DsElementOp{};
+   auto p_ds_global = ck::make_tuple(static_cast<DDataType*>(d_device_buf.GetDeviceBuffer()));

    // do GEMM
    auto gemm = DeviceGemmReduceInstance{};

...

@@ -172,8 +192,7 @@ int main(int argc, char* argv[])
    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                                      static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                                      static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                                     static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
-                                     static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()),
+                                     p_ds_global,
                                      M,
                                      N,
                                      K,

...

@@ -183,7 +202,8 @@ int main(int argc, char* argv[])
                                      a_element_op,
                                      b_element_op,
                                      c_element_op,
-                                     d1_element_op);
+                                     ds_element_op,
+                                     ds_element_op);

    if(!gemm.IsSupportedArgument(argument))
    {

...

@@ -192,47 +212,17 @@ int main(int argc, char* argv[])
            "not support this GEMM problem");
    }

-   // warm up
-   invoker.Run(argument);
-
-   // timing
-   float total_time = 0;
-
-   for(int i = 0; i < nrepeat; ++i)
-   {
-       // init D0, D1 to 0
-       d0_device_buf.SetZero();
-       d1_device_buf.SetZero();
-
-       KernelTimer timer;
-       timer.Start();
-       invoker.Run(argument);
-       timer.End();
-
-       total_time += timer.GetElapsedTime();
-   }
-
-   float ave_time = total_time / nrepeat;
-
-   std::size_t flop = std::size_t(2) * M * N * K;
-   std::size_t num_btype =
-       sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
-
-   float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-   float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-   std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-             << " GB/s, " << gemm.GetTypeString() << std::endl;
+   // [CAUTION]: launch_and_time_kernel does not initialize D. If the kernel is
+   // evaluated multiple times without re-initializing D, verification will fail.
+   d_device_buf.SetValue(ck::NumericLimits<DDataType>::Lowest());
+   invoker.Run(argument, StreamConfig{nullptr, false});

+   bool pass = true;

    if(do_verification)
    {
        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-       d0_device_buf.FromDevice(d0_m_device_result.mData.data());
-       d1_device_buf.FromDevice(d1_m_device_result.mData.data());
+       d_device_buf.FromDevice(d_m_device_result.mData.data());

        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

...
@@ -242,32 +232,35 @@ int main(int argc, char* argv[])
        ref_invoker.Run(ref_argument);

-       auto d0_reduce_op = D0ReduceOp{};
-       auto d1_reduce_op = D1ReduceOp{};
+       auto d_reduce_op = DsReduceOp{}[ck::Number<0>{}];

        for(int m = 0; m < M; ++m)
        {
-           float d0_acc = d0_reduce_op.GetReductionZeroVal();
-           float d1_acc = d1_reduce_op.GetReductionZeroVal();
+           ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue();

            for(int n = 0; n < N; ++n)
-           {
-               float d0_val = ck::type_convert<float>(c_m_n_host_result(m, n));
-               float d1_val;
-
-               d1_element_op(d1_val, d0_val);
-               d0_reduce_op(d0_acc, d0_val);
-               d1_reduce_op(d1_acc, d1_val);
-           }
+               d_reduce_op(d_acc, c_m_n_host_result(m, n));

-           d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc);
-           d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
+           d_m_host_result(m) = d_acc;
        }

-       check_error(c_m_n_host_result, c_m_n_device_result);
-       check_error(d0_m_host_result, d0_m_device_result);
-       check_error(d1_m_host_result, d1_m_device_result);
+       pass = ck::utils::check_err(
+                  c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c") &&
+              ck::utils::check_err(d_m_device_result.mData,
+                                   d_m_host_result.mData,
+                                   "Error: Incorrect results d",
+                                   1e-3,
+                                   1e-3);
    }

+   if(time_kernel)
+   {
+       float gemm_reduceMax_ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
+
+       DumpGemmLayerNormPerf<ADataType, BDataType, CDataType, DDataType>(
+           gemm_reduceMax_ave_time, M, N, K);
+   }
+
-   return 0;
+   return pass ? 0 : 1;
}
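The d-tensor comparison passes explicit tolerances (1e-3, 1e-3) because the device reduces in F32 in a different association order than the host loop, so bitwise equality cannot be expected. A generic sketch of such a combined relative/absolute tolerance test (check_err's exact rule is not shown in this diff, so the formula below is an assumption about its general shape):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Generic mixed relative/absolute tolerance comparison; the rtol/atol pair
// mirrors the (1e-3, 1e-3) arguments passed to ck::utils::check_err above.
bool check_close(const std::vector<double>& out,
                 const std::vector<double>& ref,
                 double rtol = 1e-3,
                 double atol = 1e-3)
{
    if(out.size() != ref.size())
        return false;

    for(std::size_t i = 0; i < out.size(); ++i)
        if(std::abs(out[i] - ref[i]) > atol + rtol * std::abs(ref[i]))
            return false;

    return true;
}

int main()
{
    std::vector<double> ref{1000.0, -2.0, 0.5};
    std::vector<double> dev{1000.9, -2.0009, 0.5001}; // small float-order noise

    std::cout << (check_close(dev, ref) ? "pass" : "fail") << std::endl; // pass
    return 0;
}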