gaoqiong / composable_kernel · Commits

Commit a3b4c5cb, authored Jun 03, 2022 by wangshaojie6

    merge develop branch and add gridwise pipeline v3

Parents: 48918ab9, 1677cf70
Changes: 361

Showing 20 changed files with 1565 additions and 420 deletions (+1565 −420)
example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt                     +3    −2
example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp   +62   −61
example/09_convnd_fwd/CMakeLists.txt                                   +7    −4
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp                          +61   −63
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp                          +65   −62
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp                          +344  −0
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp                          +60   −62
example/10_conv2d_bwd_data/CMakeLists.txt                              +1    −1
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp                     +12   −8
example/11_conv2d_bwd_weight/CMakeLists.txt                            +1    −1
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp                 +12   −9
example/12_reduce/CMakeLists.txt                                       +1    −0
example/12_reduce/README.md                                            +28   −13
example/12_reduce/reduce_blockwise.cpp                                 +79   −125
example/12_reduce/reduce_blockwise_two_call.cpp                        +290  −0
example/13_pool2d_fwd/CMakeLists.txt                                   +3    −1
example/13_pool2d_fwd/README.md                                        +27   −8
example/13_pool2d_fwd/pool2d_fwd_common.hpp                            +281  −0
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp                              +114  −0
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp                              +114  −0
example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt

-add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
-target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)
+# FIXME: should fix validation failure
+add_example_executable_no_testing(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_util)
example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp

@@ -7,7 +7,7 @@
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp"
 #include "device_tensor.hpp"
@@ -90,7 +90,7 @@ void PrintUseMsg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "Following arguments:\n"
               << " N, K, C, \n"
               << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
@@ -117,40 +117,40 @@ ck::utils::conv::ConvParams ParseConvParams(int argc, char* argv[])
     ck::utils::conv::ConvParams params;
     int arg_idx = 4;
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
     return params;
@@ -162,9 +162,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     const int num_dim_spatial = 2;
     ck::utils::conv::ConvParams params;
@@ -173,7 +173,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     if(argc >= 5)
@@ -181,21 +181,21 @@ int main(int argc, char* argv[])
         params = ParseConvParams(argc, argv);
     }
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -209,7 +209,7 @@ int main(int argc, char* argv[])
     // bias: assume contiguous 1d vector
     Tensor<OutDataType> bias(
-        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K)})));
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(params.K_)})));
     // residual: assume same layout as output tensor
     Tensor<OutDataType> residual(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
@@ -224,10 +224,10 @@ int main(int argc, char* argv[])
     {
     case 0: break;
    case 1:
-        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
-        residual.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
+        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
+        bias.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
+        residual.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-2, 2});
         break;
     default:
         input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
@@ -259,16 +259,16 @@ int main(int argc, char* argv[])
         static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
         static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
         static_cast<const OutDataType*>(resi_device_buf.GetDeviceBuffer()),
-        params.N,
-        params.K,
-        params.C,
-        params.input_spatial_lengths,
-        params.filter_spatial_lengths,
+        params.N_,
+        params.K_,
+        params.C_,
+        params.input_spatial_lengths_,
+        params.filter_spatial_lengths_,
         output_spatial_lengths,
-        params.conv_filter_strides,
-        params.conv_filter_dilations,
-        params.input_left_pads,
-        params.input_right_pads,
+        params.conv_filter_strides_,
+        params.conv_filter_dilations_,
+        params.input_left_pads_,
+        params.input_right_pads_,
         in_element_op,
         wei_element_op,
         out_element_op);
@@ -280,20 +280,20 @@ int main(int argc, char* argv[])
                                     "not support this problem");
     }
-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
-                                                        params.C,
-                                                        params.K,
-                                                        params.input_spatial_lengths,
-                                                        params.filter_spatial_lengths,
+        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
+                                                        params.C_,
+                                                        params.K_,
+                                                        params.input_spatial_lengths_,
+                                                        params.filter_spatial_lengths_,
                                                         output_spatial_lengths) +
-        sizeof(OutDataType) * (params.K) +
+        sizeof(OutDataType) * (params.K_) +
         sizeof(OutDataType) *
-            (params.N * params.K * output_spatial_lengths[0] * output_spatial_lengths[1]);
+            (params.N_ * params.K_ * output_spatial_lengths[0] * output_spatial_lengths[1]);
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
@@ -310,17 +310,18 @@ int main(int argc, char* argv[])
                                                   host_output,
                                                   bias,
                                                   residual,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   in_element_op,
                                                   wei_element_op,
                                                   out_element_op);
         ref_invoker.Run(ref_argument);
         out_device_buf.FromDevice(device_output.mData.data());
-        ck::utils::check_err(device_output.mData, host_output.mData);
+        return ck::utils::check_err(
+                   host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
+                   ? 0
+                   : 1;
     }
     return 0;
 }
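Across these example updates the timing argument changes shape: invoker.Run(argument, nrepeat) becomes invoker.Run(argument, StreamConfig{nullptr, time_kernel}), and the verification result now drives the process exit code. A minimal standalone sketch of that control flow, using hypothetical stand-ins (StreamConfigSketch, run_kernel, verify) rather than the composable_kernel API:

// Standalone sketch (hypothetical stand-ins, not the composable_kernel API):
// arg3 is now a 0/1 "time the kernel" flag instead of a repeat count, and the
// verification outcome becomes the exit code that a test harness can check.
#include <cstdlib>
#include <iostream>
#include <string>

struct StreamConfigSketch // stand-in shaped like StreamConfig{stream, time_kernel}
{
    void* stream_id_;
    bool time_kernel_;
};

static float run_kernel(const StreamConfigSketch& cfg) // stand-in for invoker.Run(...)
{
    // A real invoker would time the launch when time_kernel_ is true and return
    // the average time in ms; here we just return a dummy value.
    return cfg.time_kernel_ ? 0.123f : 0.0f;
}

static bool verify() { return true; } // stand-in for the host-vs-device comparison

int main(int argc, char* argv[])
{
    bool do_verification = true;
    bool time_kernel     = false;
    if(argc >= 4)
    {
        do_verification = std::stoi(argv[1]) != 0;
        time_kernel     = std::stoi(argv[3]) != 0; // arg3: time kernel (0=no, 1=yes)
    }

    float ave_time = run_kernel(StreamConfigSketch{nullptr, time_kernel});
    std::cout << "Perf: " << ave_time << " ms\n";

    if(do_verification)
    {
        return verify() ? 0 : 1; // non-zero exit signals a validation failure
    }
    return 0;
}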
example/09_convnd_fwd/CMakeLists.txt

-add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
-target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)
+add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
-target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)
 add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
-target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)
+# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed
+add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp)
+target_link_libraries(example_convnd_fwd_xdl_fp64 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_fp32 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_util)
+target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_util)
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp

@@ -5,7 +5,7 @@
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -110,7 +110,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -137,40 +137,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
     return params;
@@ -182,9 +182,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial = 2;
     ck::utils::conv::ConvParams params;
@@ -193,7 +193,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -202,21 +202,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -256,16 +256,16 @@ int main(int argc, char* argv[])
     conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                               static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                               static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                              params.N,
-                              params.K,
-                              params.C,
-                              params.input_spatial_lengths,
-                              params.filter_spatial_lengths,
+                              params.N_,
+                              params.K_,
+                              params.C_,
+                              params.input_spatial_lengths_,
+                              params.filter_spatial_lengths_,
                               output_spatial_lengths,
-                              params.conv_filter_strides,
-                              params.conv_filter_dilations,
-                              params.input_left_pads,
-                              params.input_right_pads,
+                              params.conv_filter_strides_,
+                              params.conv_filter_dilations_,
+                              params.input_left_pads_,
+                              params.input_right_pads_,
                               InElementOp{},
                               WeiElementOp{},
                               OutElementOp{});
@@ -277,21 +277,21 @@ int main(int argc, char* argv[])
                                 "not support this Conv problem");
     }
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
-        params.N,
-        params.C,
-        params.K,
-        params.input_spatial_lengths,
-        params.filter_spatial_lengths,
+        params.N_,
+        params.C_,
+        params.K_,
+        params.input_spatial_lengths_,
+        params.filter_spatial_lengths_,
         output_spatial_lengths);
     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
-              << std::endl;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << conv->GetTypeString() << std::endl;
     if(do_verification)
@@ -302,40 +302,38 @@ int main(int argc, char* argv[])
         auto ref_argument = ref_conv.MakeArgument(input,
                                                   weights,
                                                   host_output,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   InElementOp{},
                                                   WeiElementOp{},
                                                   OutElementOp{});
         ref_invoker.Run(ref_argument);
         out_device_buf.FromDevice(device_output.mData.data());
-        ck::utils::check_err(
-            host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+        return ck::utils::check_err(
+                   host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
+                   ? 0
+                   : 1;
     };
     switch(num_dim_spatial)
     {
     case 3: {
         auto ref_conv = ReferenceConvNDFwdInstance<3>();
-        verify_f(ref_conv);
-        break;
+        return verify_f(ref_conv);
     }
     case 2: {
         auto ref_conv = ReferenceConvNDFwdInstance<2>();
-        verify_f(ref_conv);
-        break;
+        return verify_f(ref_conv);
     }
     case 1: {
         auto ref_conv = ReferenceConvNDFwdInstance<1>();
-        verify_f(ref_conv);
-        break;
+        return verify_f(ref_conv);
     }
     default: {
         throw std::runtime_error("Unsupported number of spatial dimensions provided!");
     }
     }
     return 0;
 }
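The Perf line converts a FLOP count and a byte count into TFLOPS and GB/s by dividing by the average kernel time in milliseconds (flop / 1.E9 / ms and bytes / 1.E6 / ms). A standalone sketch of that arithmetic for a 2D NHWC forward convolution; the FLOP and byte formulas below are assumptions about what get_flops/get_btype compute, not their actual implementations:

// Sketch of the Perf-line arithmetic. flop / 1e9 / t_ms == flop-per-second / 1e12
// (TFLOPS); bytes / 1e6 / t_ms == bytes-per-second / 1e9 (GB/s).
#include <cstddef>
#include <iostream>

int main()
{
    // Example problem: N=128, K=256, C=192, 3x3 filter, 71x71 input and output, fp16 tensors.
    const std::size_t N = 128, K = 256, C = 192, Y = 3, X = 3;
    const std::size_t Hi = 71, Wi = 71, Ho = 71, Wo = 71;
    const std::size_t bytes_per_elem = 2; // fp16

    // Assumed FLOP model: 2 ops (multiply + add) per MAC, one MAC per (n, k, c, y, x, ho, wo).
    const std::size_t flop = std::size_t(2) * N * K * C * Y * X * Ho * Wo;

    // Assumed traffic model: read input and weights once, write the output once.
    const std::size_t num_bytes =
        bytes_per_elem * (N * C * Hi * Wi + K * C * Y * X + N * K * Ho * Wo);

    const float ave_time_ms = 1.5f; // pretend the invoker reported 1.5 ms

    const float tflops     = static_cast<float>(flop) / 1.E9 / ave_time_ms;
    const float gb_per_sec = static_cast<float>(num_bytes) / 1.E6 / ave_time_ms;

    std::cout << "Perf: " << ave_time_ms << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s" << std::endl;
}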
example/09_convnd_fwd/convnd_fwd_xdl.cpp → example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp

@@ -5,7 +5,7 @@
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -107,7 +107,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -134,40 +134,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
     return params;
@@ -179,9 +179,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial = 2;
     ck::utils::conv::ConvParams params;
@@ -190,7 +190,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -199,21 +199,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -255,16 +255,16 @@ int main(int argc, char* argv[])
     conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                               static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                               static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                              params.N,
-                              params.K,
-                              params.C,
-                              params.input_spatial_lengths,
-                              params.filter_spatial_lengths,
+                              params.N_,
+                              params.K_,
+                              params.C_,
+                              params.input_spatial_lengths_,
+                              params.filter_spatial_lengths_,
                               output_spatial_lengths,
-                              params.conv_filter_strides,
-                              params.conv_filter_dilations,
-                              params.input_left_pads,
-                              params.input_right_pads,
+                              params.conv_filter_strides_,
+                              params.conv_filter_dilations_,
+                              params.input_left_pads_,
+                              params.input_right_pads_,
                               InElementOp{},
                               WeiElementOp{},
                               OutElementOp{});
@@ -276,16 +276,16 @@ int main(int argc, char* argv[])
                                 "not support this Conv problem");
     }
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
     std::size_t flop = get_flops(
-        params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
+        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
     std::size_t num_btype =
-        get_btype<InDataType, WeiDataType, OutDataType>(params.N,
-                                                        params.C,
-                                                        params.K,
-                                                        params.input_spatial_lengths,
-                                                        params.filter_spatial_lengths,
+        get_btype<InDataType, WeiDataType, OutDataType>(params.N_,
+                                                        params.C_,
+                                                        params.K_,
+                                                        params.input_spatial_lengths_,
+                                                        params.filter_spatial_lengths_,
                                                         output_spatial_lengths);
     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -301,40 +301,43 @@ int main(int argc, char* argv[])
         auto ref_argument = ref_conv.MakeArgument(input,
                                                   weights,
                                                   host_output,
-                                                  params.conv_filter_strides,
-                                                  params.conv_filter_dilations,
-                                                  params.input_left_pads,
-                                                  params.input_right_pads,
+                                                  params.conv_filter_strides_,
+                                                  params.conv_filter_dilations_,
+                                                  params.input_left_pads_,
+                                                  params.input_right_pads_,
                                                   InElementOp{},
                                                   WeiElementOp{},
                                                   OutElementOp{});
         ref_invoker.Run(ref_argument);
         out_device_buf.FromDevice(device_output.mData.data());
-        ck::utils::check_err(
-            device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+        return ck::utils::check_err(
+                   host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
+                   ? 0
+                   : 1;
     };
     switch(num_dim_spatial)
     {
     case 3: {
         auto ref_conv = ReferenceConvNDFwdInstance<3>();
-        verify_f(ref_conv);
-        break;
+        return verify_f(ref_conv);
     }
     case 2: {
         auto ref_conv = ReferenceConvNDFwdInstance<2>();
-        verify_f(ref_conv);
-        break;
+        return verify_f(ref_conv);
    }
     case 1: {
         auto ref_conv = ReferenceConvNDFwdInstance<1>();
-        verify_f(ref_conv);
-        break;
+        return verify_f(ref_conv);
     }
     default: {
         throw std::runtime_error("Unsupported number of spatial dimensions provided!");
     }
     }
     return 0;
 }
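All of these examples take output_spatial_lengths from ConvParams::GetOutputSpatialLengths(). A hedged sketch of the standard output-size formula such a helper presumably applies (an assumption, not the library code):

// Sketch of the usual forward-convolution output-size formula; assumed to match
// what ConvParams::GetOutputSpatialLengths() computes, not copied from the library.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<int> output_spatial_lengths(const std::vector<int>& input,     // Hi, Wi, ...
                                        const std::vector<int>& filter,    // Y, X, ...
                                        const std::vector<int>& strides,
                                        const std::vector<int>& dilations,
                                        const std::vector<int>& left_pads,
                                        const std::vector<int>& right_pads)
{
    std::vector<int> out(input.size());
    for(std::size_t i = 0; i < input.size(); ++i)
    {
        // Dilation stretches the effective filter extent; padding extends the input.
        const int eff_filter = dilations[i] * (filter[i] - 1) + 1;
        out[i] = (input[i] + left_pads[i] + right_pads[i] - eff_filter) / strides[i] + 1;
    }
    return out;
}

int main()
{
    // 2D example: 71x71 input, 3x3 filter, stride 1, dilation 1, pad 1 -> 71x71 output.
    auto out = output_spatial_lengths({71, 71}, {3, 3}, {1, 1}, {1, 1}, {1, 1}, {1, 1});
    std::cout << "Ho=" << out[0] << ", Wo=" << out[1] << std::endl; // prints Ho=71, Wo=71
}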
example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp (new file, mode 100644)

#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>

#include "check_err.hpp"
#include "config.hpp"
#include "conv_util.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"

namespace {

using InDataType  = double;
using WeiDataType = double;
using OutDataType = double;
using AccDataType = double;

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

using DeviceConvFwdBasePtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;

template <ck::index_t NumDimSpatial>
using DeviceConvNDFwdInstance =
    ck::tensor_operation::device::DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
        // clang-format off
        InDataType,        //
        WeiDataType,       //
        OutDataType,       //
        AccDataType,       //
        InElementOp,       // Input Elementwise Operation
        WeiElementOp,      // Weights Elementwise Operation
        OutElementOp,      // Output Elementwise Operation
        ConvFwdDefault,    // ConvForwardSpecialization
        NumDimSpatial,     // NumDimSpatial
        256,               // BlockSize
        128,               // MPerBlock
        128,               // NPerBlock
        4,                 // K0PerBlock
        2,                 // K1
        16,                // MPerXDL
        16,                // NPerXDL
        4,                 // MXdlPerWave
        4,                 // NXdlPerWave
        S<4, 64, 1>,       // ABlockTransferThreadClusterLengths_K0_M_K1
        S<1, 0, 2>,        // ABlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,        // ABlockTransferSrcAccessOrder
        2,                 // ABlockTransferSrcVectorDim
        2,                 // ABlockTransferSrcScalarPerVector
        2,                 // ABlockTransferDstScalarPerVector_K1
        true,              // ABlockLdsAddExtraM
        S<4, 64, 1>,       // BBlockTransferThreadClusterLengths_K0_N_K1
        S<1, 0, 2>,        // BBlockTransferThreadClusterArrangeOrder
        S<1, 0, 2>,        // BBlockTransferSrcAccessOrder
        2,                 // BBlockTransferSrcVectorDim
        2,                 // BBlockTransferSrcScalarPerVector
        2,                 // BBlockTransferDstScalarPerVector_K1
        true,              // BBlockTransferAddExtraN
        7,                 // CThreadTransferSrcDstVectorDim
        1>;                // CThreadTransferDstScalarPerVector
// clang-format on

template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
                                                                                WeiDataType,
                                                                                OutDataType,
                                                                                InElementOp,
                                                                                WeiElementOp,
                                                                                OutElementOp,
                                                                                NumDimSpatial>;

DeviceConvFwdBasePtr get_conv_instance(int num_dim_spatial)
{
    switch(num_dim_spatial)
    {
    case 3: {
        return std::make_unique<DeviceConvNDFwdInstance<3>>();
    }
    case 2: {
        return std::make_unique<DeviceConvNDFwdInstance<2>>();
    }
    case 1: {
        return std::make_unique<DeviceConvNDFwdInstance<1>>();
    }
    default: {
        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
    }
    }
}

void print_use_msg()
{
    std::cout << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: run kernel # of times (>1)\n"
              << "arg4: N spatial dimensions (default 2)\n"
              << "Following arguments (depending on number of spatial dims):\n"
              << " N, K, C, \n"
              << " <filter spatial dimensions>, (ie Y, X for 2D)\n"
              << " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
              << " <strides>, (ie Sy, Sx for 2D)\n"
              << " <dilations>, (ie Dy, Dx for 2D)\n"
              << " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
              << " <right padding>, (ie RightPy, RightPx for 2D)\n"
              << std::endl;
}

ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, char* argv[])
{
    // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
    int conv_args     = 3 + num_dim_spatial * 6;
    int cmdline_nargs = conv_args + 5;
    if(cmdline_nargs != argc)
    {
        print_use_msg();
        exit(0);
    }

    ck::utils::conv::ConvParams params;
    int arg_idx = 5;

    params.num_dim_spatial_ = num_dim_spatial;
    params.N_               = std::stoi(argv[arg_idx++]);
    params.K_               = std::stoi(argv[arg_idx++]);
    params.C_               = std::stoi(argv[arg_idx++]);

    params.filter_spatial_lengths_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_spatial_lengths_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_strides_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
    }
    params.conv_filter_dilations_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_left_pads_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
    }
    params.input_right_pads_.resize(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
    }

    return params;
}

} // anonymous namespace

int main(int argc, char* argv[])
{
    using namespace ck::utils::conv;

    bool do_verification = 0;
    int init_method      = 0;
    bool time_kernel     = false;
    int num_dim_spatial  = 2;

    ck::utils::conv::ConvParams params;

    if(argc >= 5)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
        num_dim_spatial = std::stoi(argv[4]);
    }

    if(argc >= 6)
    {
        params = parse_conv_params(num_dim_spatial, argc, argv);
    }

    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
                                        static_cast<std::size_t>(params.C_)};
    input_dims.insert(std::end(input_dims),
                      std::begin(params.input_spatial_lengths_),
                      std::end(params.input_spatial_lengths_));

    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
                                         static_cast<std::size_t>(params.C_)};
    filter_dims.insert(std::end(filter_dims),
                       std::begin(params.filter_spatial_lengths_),
                       std::end(params.filter_spatial_lengths_));

    const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();

    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
                                         static_cast<std::size_t>(params.K_)};
    output_dims.insert(std::end(output_dims),
                       std::begin(output_spatial_lengths),
                       std::end(output_spatial_lengths));

    Tensor<InDataType> input(get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
    Tensor<WeiDataType> weights(get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
    Tensor<OutDataType> host_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));
    Tensor<OutDataType> device_output(get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

    std::cout << "input: " << input.mDesc << std::endl;
    std::cout << "weights: " << weights.mDesc << std::endl;
    std::cout << "output: " << host_output.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        weights.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
        break;
    case 2:
        input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        weights.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
        break;
    default:
        input.GenerateTensorValue(GeneratorTensor_1<InDataType>{1});
        weights.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
    }

    DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
    DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpace());

    in_device_buf.ToDevice(input.mData.data());
    wei_device_buf.ToDevice(weights.mData.data());

    // do GEMM
    auto conv    = get_conv_instance(num_dim_spatial);
    auto invoker = conv->MakeInvokerPointer();
    auto argument =
        conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                  static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                  static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                                  params.N_,
                                  params.K_,
                                  params.C_,
                                  params.input_spatial_lengths_,
                                  params.filter_spatial_lengths_,
                                  output_spatial_lengths,
                                  params.conv_filter_strides_,
                                  params.conv_filter_dilations_,
                                  params.input_left_pads_,
                                  params.input_right_pads_,
                                  InElementOp{},
                                  WeiElementOp{},
                                  OutElementOp{});

    if(!conv->IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error("wrong! device_conv with the specified compilation parameters does "
                                 "not support this Conv problem");
    }

    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    std::size_t flop = get_flops(
        params.N_, params.C_, params.K_, params.filter_spatial_lengths_, output_spatial_lengths);
    std::size_t num_btype = get_btype<InDataType, WeiDataType, OutDataType>(
        params.N_,
        params.C_,
        params.K_,
        params.input_spatial_lengths_,
        params.filter_spatial_lengths_,
        output_spatial_lengths);

    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    if(do_verification)
    {
        auto verify_f = [&input, &weights, &host_output, &params, &out_device_buf, &device_output](
                            const auto& ref_conv) {
            auto ref_invoker  = ref_conv.MakeInvoker();
            auto ref_argument = ref_conv.MakeArgument(input,
                                                      weights,
                                                      host_output,
                                                      params.conv_filter_strides_,
                                                      params.conv_filter_dilations_,
                                                      params.input_left_pads_,
                                                      params.input_right_pads_,
                                                      InElementOp{},
                                                      WeiElementOp{},
                                                      OutElementOp{});

            ref_invoker.Run(ref_argument);
            out_device_buf.FromDevice(device_output.mData.data());
            ck::utils::check_err(
                host_output.mData, device_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
        };

        switch(num_dim_spatial)
        {
        case 3: {
            auto ref_conv = ReferenceConvNDFwdInstance<3>();
            verify_f(ref_conv);
            break;
        }
        case 2: {
            auto ref_conv = ReferenceConvNDFwdInstance<2>();
            verify_f(ref_conv);
            break;
        }
        case 1: {
            auto ref_conv = ReferenceConvNDFwdInstance<1>();
            verify_f(ref_conv);
            break;
        }
        default: {
            throw std::runtime_error("Unsupported number of spatial dimensions provided!");
        }
        }
    }
}
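The new fp64 example rejects malformed command lines by requiring argc == (3 + num_dim_spatial * 6) + 5: argv[0], the four control arguments, N/K/C, and six per-dimension value lists. A small sketch of that count check and what it works out to:

// Sketch of the fp64 example's argument-count check; for the default
// num_dim_spatial = 2 the expected argc is 20.
#include <iostream>

int expected_argc(int num_dim_spatial)
{
    // filter, input, strides, dilations, left pads, right pads: one value per spatial dim each.
    const int conv_args = 3 + num_dim_spatial * 6; // N, K, C plus six per-dimension lists
    return conv_args + 5;                          // plus argv[0] and the four control args
}

int main()
{
    for(int ndim = 1; ndim <= 3; ++ndim)
    {
        std::cout << "num_dim_spatial=" << ndim
                  << " -> expected argc=" << expected_argc(ndim) << "\n"; // 14, 20, 26
    }
}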
example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp

@@ -5,7 +5,7 @@
 #include "check_err.hpp"
 #include "config.hpp"
-#include "conv_fwd_util.hpp"
+#include "conv_util.hpp"
 #include "device.hpp"
 #include "device_tensor.hpp"
 #include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
@@ -112,7 +112,7 @@ void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
-              << "arg3: run kernel # of times (>1)\n"
+              << "arg3: time kernel (0=n0, 1=yes)\n"
               << "arg4: N spatial dimensions (default 2)\n"
               << "Following arguments (depending on number of spatial dims):\n"
               << " N, K, C, \n"
@@ -139,40 +139,40 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, int argc, cha
     ck::utils::conv::ConvParams params;
     int arg_idx = 5;
-    params.num_dim_spatial = num_dim_spatial;
-    params.N               = std::stoi(argv[arg_idx++]);
-    params.K               = std::stoi(argv[arg_idx++]);
-    params.C               = std::stoi(argv[arg_idx++]);
+    params.num_dim_spatial_ = num_dim_spatial;
+    params.N_               = std::stoi(argv[arg_idx++]);
+    params.K_               = std::stoi(argv[arg_idx++]);
+    params.C_               = std::stoi(argv[arg_idx++]);
-    params.filter_spatial_lengths.resize(num_dim_spatial);
+    params.filter_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_spatial_lengths.resize(num_dim_spatial);
+    params.input_spatial_lengths_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
+        params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_strides.resize(num_dim_spatial);
+    params.conv_filter_strides_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.conv_filter_dilations.resize(num_dim_spatial);
+    params.conv_filter_dilations_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
+        params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_left_pads.resize(num_dim_spatial);
+    params.input_left_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_left_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
     }
-    params.input_right_pads.resize(num_dim_spatial);
+    params.input_right_pads_.resize(num_dim_spatial);
     for(int i = 0; i < num_dim_spatial; ++i)
     {
-        params.input_right_pads[i] = std::stoi(argv[arg_idx++]);
+        params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
     }
     return params;
@@ -184,9 +184,9 @@ int main(int argc, char* argv[])
 {
     using namespace ck::utils::conv;
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int num_dim_spatial = 2;
     ck::utils::conv::ConvParams params;
@@ -195,7 +195,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         num_dim_spatial = std::stoi(argv[4]);
     }
@@ -204,21 +204,21 @@ int main(int argc, char* argv[])
         params = parse_conv_params(num_dim_spatial, argc, argv);
     }
-    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
-                                        static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N_),
+                                        static_cast<std::size_t>(params.C_)};
     input_dims.insert(std::end(input_dims),
-                      std::begin(params.input_spatial_lengths),
-                      std::end(params.input_spatial_lengths));
+                      std::begin(params.input_spatial_lengths_),
+                      std::end(params.input_spatial_lengths_));
-    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
-                                         static_cast<std::size_t>(params.C)};
+    std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K_),
+                                         static_cast<std::size_t>(params.C_)};
     filter_dims.insert(std::end(filter_dims),
-                       std::begin(params.filter_spatial_lengths),
-                       std::end(params.filter_spatial_lengths));
+                       std::begin(params.filter_spatial_lengths_),
+                       std::end(params.filter_spatial_lengths_));
     const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
-    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
-                                         static_cast<std::size_t>(params.K)};
+    std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N_),
+                                         static_cast<std::size_t>(params.K_)};
     output_dims.insert(std::end(output_dims),
                        std::begin(output_spatial_lengths),
                        std::end(output_spatial_lengths));
@@ -258,16 +258,16 @@ int main(int argc, char* argv[])
     conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                               static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                               static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                              params.N,
-                              params.K,
-                              params.C,
-                              params.input_spatial_lengths,
-                              params.filter_spatial_lengths,
+                              params.N_,
+                              params.K_,
+                              params.C_,
+                              params.input_spatial_lengths_,
+                              params.filter_spatial_lengths_,
                               output_spatial_lengths,
-                              params.conv_filter_strides,
-                              params.conv_filter_dilations,
-                              params.input_left_pads,
-                              params.input_right_pads,
+                              params.conv_filter_strides_,
+                              params.conv_filter_dilations_,
+                              params.input_left_pads_,
+                              params.input_right_pads_,
                               InElementOp{},
                               WeiElementOp{},
                               OutElementOp{});
@@ -279,16 +279,16 @@ int main(int argc, char* argv[])
                                 "not support this Conv problem");
     }
-    float ave_time = invoker->Run(argument.get(), nrepeat);
+    float ave_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel});
}
);
std
::
size_t
flop
=
get_flops
(
std
::
size_t
flop
=
get_flops
(
params
.
N
,
params
.
C
,
params
.
K
,
params
.
filter_spatial_lengths
,
output_spatial_lengths
);
params
.
N
_
,
params
.
C
_
,
params
.
K
_
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
);
std
::
size_t
num_btype
=
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
std
::
size_t
num_btype
=
get_btype
<
InDataType
,
WeiDataType
,
OutDataType
>
(
params
.
N
,
params
.
N
_
,
params
.
C
,
params
.
C
_
,
params
.
K
,
params
.
K
_
,
params
.
input_spatial_lengths
,
params
.
input_spatial_lengths
_
,
params
.
filter_spatial_lengths
,
params
.
filter_spatial_lengths
_
,
output_spatial_lengths
);
output_spatial_lengths
);
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
...
@@ -304,40 +304,38 @@ int main(int argc, char* argv[])
...
@@ -304,40 +304,38 @@ int main(int argc, char* argv[])
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
auto
ref_argument
=
ref_conv
.
MakeArgument
(
input
,
weights
,
weights
,
host_output
,
host_output
,
params
.
conv_filter_strides
,
params
.
conv_filter_strides
_
,
params
.
conv_filter_dilations
,
params
.
conv_filter_dilations
_
,
params
.
input_left_pads
,
params
.
input_left_pads
_
,
params
.
input_right_pads
,
params
.
input_right_pads
_
,
InElementOp
{},
InElementOp
{},
WeiElementOp
{},
WeiElementOp
{},
OutElementOp
{});
OutElementOp
{});
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
out_device_buf
.
FromDevice
(
device_output
.
mData
.
data
());
out_device_buf
.
FromDevice
(
device_output
.
mData
.
data
());
ck
::
utils
::
check_err
(
return
ck
::
utils
::
check_err
(
host_output
.
mData
,
device_output
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
);
host_output
.
mData
,
device_output
.
mData
,
"Error: incorrect results!"
,
1e-5
f
,
1e-4
f
)
?
0
:
1
;
};
};
switch
(
num_dim_spatial
)
switch
(
num_dim_spatial
)
{
{
case
3
:
{
case
3
:
{
auto
ref_conv
=
ReferenceConvNDFwdInstance
<
3
>
();
auto
ref_conv
=
ReferenceConvNDFwdInstance
<
3
>
();
verify_f
(
ref_conv
);
return
verify_f
(
ref_conv
);
break
;
}
}
case
2
:
{
case
2
:
{
auto
ref_conv
=
ReferenceConvNDFwdInstance
<
2
>
();
auto
ref_conv
=
ReferenceConvNDFwdInstance
<
2
>
();
verify_f
(
ref_conv
);
return
verify_f
(
ref_conv
);
break
;
}
}
case
1
:
{
case
1
:
{
auto
ref_conv
=
ReferenceConvNDFwdInstance
<
1
>
();
auto
ref_conv
=
ReferenceConvNDFwdInstance
<
1
>
();
verify_f
(
ref_conv
);
return
verify_f
(
ref_conv
);
break
;
}
}
default:
{
default:
{
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
throw
std
::
runtime_error
(
"Unsupported number of spatial dimensions provided!"
);
}
}
}
}
}
}
return
0
;
}
}
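Every example touched by this commit follows the same two changes: the repeat-count argument (`nrepeat`) becomes a boolean `time_kernel` that reaches the device invoker through a `StreamConfig`, and the verification result now determines the process exit code. A minimal sketch of that pattern, assuming the same composable_kernel example headers the files above already include; the `run_example` wrapper itself is illustrative glue, not part of the library:

```cpp
// Sketch only: the two calls below (Invoker::Run with a StreamConfig and
// ck::utils::check_err) are taken from the diffs above; everything else is glue.
template <typename Invoker, typename Argument, typename DataVector>
int run_example(Invoker& invoker,
                Argument& argument,
                bool time_kernel,
                bool do_verification,
                const DataVector& device_result,
                const DataVector& host_reference)
{
    // nrepeat is gone; timing is now requested through the StreamConfig flag.
    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
    (void)ave_time;

    // Examples report success (0) or failure (1) so they can double as tests.
    if(do_verification)
        return ck::utils::check_err(device_result, host_reference) ? 0 : 1;

    return 0;
}
```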
example/10_conv2d_bwd_data/CMakeLists.txt  View file @ a3b4c5cb
 add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_util)
example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp  View file @ a3b4c5cb
@@ -77,9 +77,9 @@ using ReferenceConvBwdInstance = ck::tensor_operation::host::ReferenceConvBwdDat
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;

     // Conv shape
     ck::index_t N = 128;
@@ -102,13 +102,13 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
     }
     else if(argc == 19)
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);

         N = std::stoi(argv[4]);
         K = std::stoi(argv[5]);
@@ -130,7 +130,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
                "RightPx\n");
         exit(0);
@@ -214,7 +214,7 @@ int main(int argc, char* argv[])
                                  "not support this Conv problem");
     }

-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -249,6 +249,10 @@ int main(int argc, char* argv[])
         in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data());

-        ck::utils::check_err(in_n_c_hi_wi_device_result.mData, in_n_c_hi_wi_host_result.mData);
+        return ck::utils::check_err(in_n_c_hi_wi_device_result.mData,
+                                    in_n_c_hi_wi_host_result.mData)
+                   ? 0
+                   : 1;
     }
+
+    return 0;
 }
example/11_conv2d_bwd_weight/CMakeLists.txt  View file @ a3b4c5cb
 add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
-target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_util)
example/11_conv2d_bwd_weight/conv2d_bwd_weight_xdl.cpp  View file @ a3b4c5cb
@@ -68,7 +68,7 @@ using DeviceConvBwdWeightInstance = ck::tensor_operation::device::
     true,           // BBlockLdsAddExtraN
     1,              // CShuffleMXdlPerWavePerShuffle
     1,              // CShuffleNXdlPerWavePerShuffle
-    S<1, 32, 1, 4>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    S<1, 32, 1, 8>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
     8>;             // CBlockTransferScalarPerVector_NWaveNPerXdl
 // clang-format on
@@ -82,9 +82,9 @@ using ReferenceConvBwdWeightInstance =
 int main(int argc, char* argv[])
 {
-    bool do_verification = 0;
-    int init_method      = 0;
-    int nrepeat          = 5;
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
     int do_log           = 0;
     int split_k          = 4;
@@ -109,7 +109,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
     }
@@ -117,7 +117,7 @@ int main(int argc, char* argv[])
     {
         do_verification = std::stoi(argv[1]);
         init_method     = std::stoi(argv[2]);
-        nrepeat         = std::stoi(argv[3]);
+        time_kernel     = std::stoi(argv[3]);
         do_log          = std::stoi(argv[4]);
         split_k         = std::stoi(argv[5]);
@@ -141,7 +141,7 @@ int main(int argc, char* argv[])
     {
         printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg3: time kernel (0=n0, 1=yes)\n");
         printf("arg4: is show log (0=no, 1=yes)\n");
         printf("arg5: split-k\n");
         printf("arg6 to 19: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
@@ -246,7 +246,7 @@ int main(int argc, char* argv[])
         return 1;
     }

-    float ave_time = invoker.Run(argument, nrepeat);
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

     std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
@@ -291,6 +291,9 @@ int main(int argc, char* argv[])
             LogRangeAsType<float>(std::cout << "wei_host  : ", wei_k_c_y_x_host_result.mData, ",")
                 << std::endl;
         }

-        ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData);
+        return ck::utils::check_err(wei_k_c_y_x_device_result.mData, wei_k_c_y_x_host_result.mData)
+                   ? 0
+                   : 1;
     }
+
+    return 0;
 }
example/12_reduce/CMakeLists.txt  View file @ a3b4c5cb
 add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
+add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
example/12_reduce/README.md  View file @ a3b4c5cb
@@ -5,23 +5,38 @@
 # -D <xxx> : input 4-d tensor lengths
 # -v <x> : verification (0=no, 1=yes)
 #arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
-#arg2: run kernel # of times (>1)
-./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
+#arg2: time kernel (0=no, 1=yes)
+./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
 ```
 Result
 ```
+./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
 launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 3 times...
-Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
-error: 0
-max_diff: 0, 529, 529
-root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
-launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
-Warm up
-Start running 10 times...
-Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
-error: 0
-max_diff: 0, 528, 528
+Warm up 1 time
+Start running 10 times...
+Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
+```
+
+# Instructions for ```example_reduce_blockwise_two_call```
+
+## Run ```example_reduce_blockwise_two_call```
+```bash
+#arg1: verification (0=no, 1=yes(
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+./bin/example_reduce_blockwise_two_call 1 2 1
+```
+Result
+```
+./bin/example_reduce_blockwise_two_call 1 2 1
+launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
 ```
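The two-call example chains two device reductions to compute a NORM2 (square root of a sum of squares) over the last two dimensions of a rank-5 tensor: the first call squares and accumulates along the innermost dimension, the second accumulates the partial sums and applies the square root only at the end, which is how the InElementwiseOperation/PassThroughOp and PassThroughOp/AccElementwiseOperation pairings in reduce_blockwise_two_call.cpp below are arranged. A small standalone host-side sketch of that identity, independent of the library types:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// sqrt(sum_{j,k} x[j][k]^2) == sqrt(sum_j (sum_k x[j][k]^2)),
// which is the identity reduce_blockwise_two_call relies on when it
// splits one NORM2 reduction into two chained device reductions.
int main()
{
    const std::vector<std::vector<float>> x = {{1.f, 2.f}, {3.f, 4.f}};

    // one-shot NORM2 over both dimensions
    float sq_sum = 0.f;
    for(const auto& row : x)
        for(float v : row)
            sq_sum += v * v;
    const float norm_once = std::sqrt(sq_sum);

    // two-call version: reduce the inner dimension first, then the outer one
    std::vector<float> partial;
    for(const auto& row : x)
    {
        float s = 0.f;
        for(float v : row)
            s += v * v; // call 1: square + accumulate, no sqrt yet
        partial.push_back(s);
    }
    float outer = 0.f;
    for(float s : partial)
        outer += s;                            // call 2: plain accumulate ...
    const float norm_twice = std::sqrt(outer); // ... and sqrt only at the very end

    std::printf("%f %f\n", norm_once, norm_twice); // both print 5.477226
}
```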
example/12_reduce/reduce_blockwise.cpp  View file @ a3b4c5cb
@@ -12,8 +12,8 @@
 #include "host_tensor_generator.hpp"
 #include "device_tensor.hpp"
 #include "device_base.hpp"
-#include "device_reduce_blockwise.hpp"
-#include "host_reduce_util.hpp"
+#include "device_reduce_multiblock.hpp"
+#include "host_common_util.hpp"
 #include "host_reduction.hpp"
 #include "reduction_enums.hpp"
@@ -30,9 +30,8 @@ constexpr int Rank = 4;
 constexpr int NumReduceDim = 3;

 constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
-constexpr NanPropagation NanOpt     = NanPropagation::PROPAGATE_NAN;
-constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
-constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
+constexpr bool PropagateNan         = true;
+constexpr bool OutputIndex          = false;

 using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
 using InElementwiseOperation =
@@ -40,7 +39,7 @@ using InElementwiseOperation =
 using AccElementwiseOperation =
     typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

-using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
+using DeviceReduceInstance = DeviceReduceMultiBlock<InDataType,
                                                    AccDataType,
                                                    OutDataType,
                                                    Rank,
@@ -48,8 +47,10 @@ using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
                                                    ReduceOperation,
                                                    InElementwiseOperation,
                                                    AccElementwiseOperation,
+                                                   InMemoryDataOperationEnum::Set,
                                                    PropagateNan,
-                                                   false,
+                                                   OutputIndex,
+                                                   false, // HaveIndexInputIfOutputIndex
                                                    256,
                                                    4,
                                                    64,
@@ -60,66 +61,22 @@ using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
                                                    1>;

 static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
-                                       {"scales", required_argument, nullptr, 'S'},
                                        {"verify", required_argument, nullptr, 'v'},
                                        {"help", no_argument, nullptr, '?'},
                                        {nullptr, 0, nullptr, 0}};

 class SimpleAppArgs
 {
-    template <typename T>
-    static T getSingleValueFromString(const std::string& valueStr)
-    {
-        std::istringstream iss(valueStr);
-
-        T ret;
-
-        iss >> ret;
-
-        return (ret);
-    };
-
-    template <typename T>
-    static std::vector<T> getTypeValuesFromString(const char* cstr_values)
-    {
-        std::string valuesStr(cstr_values);
-
-        std::vector<T> values;
-        std::size_t pos = 0;
-        std::size_t new_pos;
-
-        new_pos = valuesStr.find(',', pos);
-        while(new_pos != std::string::npos)
-        {
-            const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);
-
-            T val = getSingleValueFromString<T>(sliceStr);
-
-            values.push_back(val);
-
-            pos     = new_pos + 1;
-            new_pos = valuesStr.find(',', pos);
-        };
-
-        std::string sliceStr = valuesStr.substr(pos);
-        T val                = getSingleValueFromString<T>(sliceStr);
-
-        values.push_back(val);
-
-        return (values);
-    };
-
     private:
     int option_index = 0;

     public:
-    std::vector<size_t> inLengths;
-    std::vector<float> scales;
-    bool do_verification = false;
-    int init_method      = 1;
-    int nrepeat          = 5;
+    std::vector<size_t> inLengths = {16, 64, 32, 960};
+    std::vector<float> scales     = {1.0f, 0.0f};
+    bool do_verification          = true;
+    int init_method               = 1;
+    bool time_kernel              = true;

     public:
     void show_usage(const char* cmd)
@@ -127,24 +84,24 @@ class SimpleAppArgs
         std::cout << "Usage of " << cmd << std::endl;
         std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
                   << std::endl;
-        std::cout << "--scales or -S, comma separated two float values for alpha and beta"
-                  << std::endl;
         std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                      "comparing with the host-based reduction"
                   << std::endl;
         std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
                      "value, 3=decimal value)"
                   << std::endl;
-        std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
+        std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
     };

     int processArgs(int argc, char* argv[])
     {
-        unsigned int ch;
+        using ck::host_common::getTypeValuesFromString;
+
+        int ch;

         while(1)
         {
-            ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index);
+            ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
             if(ch == -1)
                 break;

             switch(ch)
@@ -155,12 +112,6 @@ class SimpleAppArgs
                 inLengths = getTypeValuesFromString<size_t>(optarg);
                 break;
-            case 'S':
-                if(!optarg)
-                    throw std::runtime_error("Invalid option format!");
-
-                scales = getTypeValuesFromString<float>(optarg);
-                break;
             case 'v':
                 if(!optarg)
                     throw std::runtime_error("Invalid option format!");
@@ -182,7 +133,7 @@ class SimpleAppArgs
             throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

         init_method = std::atoi(argv[optind++]);
-        nrepeat     = std::atoi(argv[optind]);
+        time_kernel = static_cast<bool>(std::atoi(argv[optind]));

         if(scales.empty())
         {
@@ -196,23 +147,21 @@ class SimpleAppArgs
 int main(int argc, char* argv[])
 {
-    using namespace ck::host_reduce;
-
     const std::vector<int> reduceDims{0, 1, 2};
     const std::vector<int> invariantDims{3};

     SimpleAppArgs args;

-    if(args.processArgs(argc, argv) < 0)
-        return (-1);
+    if(argc > 1)
+    {
+        if(args.processArgs(argc, argv) < 0)
+            return (-1);
+    };

     constexpr bool op_support_indices =
         (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
          ReduceOpId == ReduceTensorOp::AMAX);

-    constexpr bool NeedIndices =
-        (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
-
     // if input is half type, no reason to use float for indiced reduction operation and must use
     // float for non-indiced reduction operation for accuracy
     constexpr bool invalid_reduce_1 =
@@ -226,8 +175,7 @@ int main(int argc, char* argv[])
         (op_support_indices && !std::is_same<AccDataType, float>::value);

     // indices option can only be used when it is really needed
-    constexpr bool invalid_reduce_3 =
-        (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
+    constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex);

     constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
@@ -295,39 +243,42 @@ int main(int argc, char* argv[])
     if(beta != 0.0f)
         out_dev.ToDevice(out.mData.data());

-    size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
+    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;

-    DeviceMem out_indices_dev(indicesSizeInBytes);
+    DeviceMem out_index_dev(indicesSizeInBytes);

     if(args.do_verification)
     {
         ReductionHost<InDataType,
                       AccDataType,
                       OutDataType,
-                      ReduceOpId,
+                      ReduceOperation,
+                      InElementwiseOperation,
+                      AccElementwiseOperation,
                       Rank,
                       NumReduceDim,
                       PropagateNan,
-                      NeedIndices>
+                      OutputIndex>
             hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

         hostReduce.Run(
             alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
     };

-    const auto i_inLengths  = to_int_vector(args.inLengths);
-    const auto i_inStrides  = to_int_vector(inStrides);
-    const auto i_outLengths = to_int_vector(outLengths);
-    const auto i_outStrides = to_int_vector(outStrides);
-
-    auto reduce = DeviceReduceInstance{};
-
-    auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
-
-    DeviceMem ws_dev(wsSizeInBytes);
+    std::vector<ck::index_t> i_inLengths;
+    std::vector<ck::index_t> i_inStrides;
+    std::vector<ck::index_t> i_outLengths;
+    std::vector<ck::index_t> i_outStrides;
+
+    i_inLengths.assign(args.inLengths.begin(), args.inLengths.end());
+    i_inStrides.assign(inStrides.begin(), inStrides.end());
+    i_outLengths.assign(outLengths.begin(), outLengths.end());
+    i_outStrides.assign(outStrides.begin(), outStrides.end());
+
+    auto reduce = DeviceReduceInstance{};

     auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
                                                    i_inStrides,
                                                    i_outLengths,
                                                    i_outStrides,
@@ -335,11 +286,11 @@ int main(int argc, char* argv[])
                                                    alpha,
                                                    beta,
                                                    in_dev.GetDeviceBuffer(),
+                                                   nullptr,
                                                    out_dev.GetDeviceBuffer(),
-                                                   out_indices_dev.GetDeviceBuffer(),
-                                                   ws_dev.GetDeviceBuffer(),
-                                                   InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
-                                                   AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});
+                                                   out_index_dev.GetDeviceBuffer(),
+                                                   InElementwiseOperation{static_cast<int>(reduce_total_length)},
+                                                   AccElementwiseOperation{static_cast<int>(reduce_total_length)});

     if(!reduce.IsSupportedArgument(argument_ptr.get()))
     {
@@ -352,7 +303,7 @@ int main(int argc, char* argv[])
     auto invoker_ptr = reduce.MakeInvokerPointer();

-    float avg_time = invoker_ptr->Run(argument_ptr.get(), args.nrepeat);
+    float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel});

     std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) +
                             invariant_total_length * sizeof(OutDataType);
@@ -362,16 +313,19 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
               << std::endl;

+    bool pass = true;
+
     if(args.do_verification)
     {
         out_dev.FromDevice(out.mData.data());
-        ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);

-        if(NeedIndices)
+        if(OutputIndex)
         {
-            out_indices_dev.FromDevice(out_indices.mData.data());
-            ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+            out_index_dev.FromDevice(out_indices.mData.data());
+            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
         };
     };

+    return (pass ? 0 : 1);
 }
example/12_reduce/reduce_blockwise_two_call.cpp  0 → 100644  View file @ a3b4c5cb
#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "check_err.hpp"
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_multiblock.hpp"
#include "host_common_util.hpp"
#include "host_reduction.hpp"

#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"

using namespace ck;
using namespace ck::tensor_operation::device;

using InOutDataType = ck::half_t;
using AccDataType   = float;

constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan         = true;
constexpr bool OutputIndex          = false;

using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType>;

using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
                                                      AccDataType,
                                                      InOutDataType,
                                                      5, // Rank
                                                      1, // NumReduceDim
                                                      ReduceOperation,
                                                      InElementwiseOperation,
                                                      PassThroughOp,
                                                      InMemoryDataOperationEnum::Set,
                                                      PropagateNan,
                                                      OutputIndex,
                                                      false, // HaveIndexInputIfOutputIndex
                                                      256,
                                                      32,
                                                      8,
                                                      1,
                                                      1,
                                                      1, // vector dim
                                                      1,
                                                      1>;

using DeviceReduceInstance_2 = DeviceReduceMultiBlock<InOutDataType,
                                                      AccDataType,
                                                      InOutDataType,
                                                      4, // Rank
                                                      1, // NumReduceDim
                                                      ReduceOperation,
                                                      PassThroughOp,
                                                      AccElementwiseOperation,
                                                      InMemoryDataOperationEnum::Set,
                                                      PropagateNan,
                                                      OutputIndex,
                                                      false, // HaveIndexInputIfOutputIndex
                                                      256,
                                                      128,
                                                      2,
                                                      1,
                                                      1,
                                                      1, // vector dim
                                                      1,
                                                      1>;

static bool do_verify;
static int init_method;
static float alpha;
static float beta;
static bool time_kernel;

int main(int argc, char* argv[])
{
    // used by the device reduction
    const std::vector<int> reduceDims_1    = {4};
    const std::vector<int> invariantDims_1 = {0, 1, 2, 3};
    const std::vector<int> reduceDims_2    = {3};
    const std::vector<int> invariantDims_2 = {0, 1, 2};

    // used by the host reduction
    const std::vector<int> reduceDims    = {3, 4};
    const std::vector<int> invariantDims = {0, 1, 2};

    const std::vector<size_t> inLengths_1 = {64, 320, 80, 4, 128};

    // input lengths of the second reduction, which is also the output lengths of the first
    // reduction
    const std::vector<size_t> inLengths_2 = {64, 320, 80, 4};

    const std::vector<size_t> outLengths = {64, 320, 80};

    if(argc == 1)
    {
        do_verify   = true;
        init_method = 2;
        time_kernel = true;
    }
    else if(argc == 4)
    {
        do_verify   = static_cast<bool>(argv[1]);
        init_method = atoi(argv[2]);
        time_kernel = static_cast<bool>(atoi(argv[3]));
    }
    else
    {
        std::ostringstream ostr;

        ostr << "Wrong parameter! " << std::endl
             << "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl;

        throw std::runtime_error(ostr.str());
    };

    alpha = 1.0f;
    beta  = 0.0f;

    Tensor<InOutDataType> in_1(inLengths_1);
    Tensor<InOutDataType> out_ref(outLengths);
    Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
    Tensor<InOutDataType> out(outLengths);

    auto inStrides_1 = in_1.mDesc.GetStrides();
    auto inStrides_2 = in_2.mDesc.GetStrides();
    auto outStrides  = out.mDesc.GetStrides();

    size_t invariant_total_length = out.mDesc.GetElementSize();
    size_t reduce_total_length    = in_1.mDesc.GetElementSize() / invariant_total_length;

    std::size_t num_thread = 1;

    if(do_verify)
    {
        switch(init_method)
        {
        case 0: break;
        case 1:
            in_1.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_1<InOutDataType>{1}, num_thread);
            break;
        case 2:
            in_1.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_2<InOutDataType>{-5, 5}, num_thread);
            break;
        default:
            in_1.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
            if(beta != 0.0f)
                out_ref.GenerateTensorValue(GeneratorTensor_3<InOutDataType>{-5.0, 5.0}, num_thread);
        }

        if(beta != 0.0f)
            for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++)
                out.mData[i] = out_ref.mData[i];
    };

    DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpace());
    DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpace());
    DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpace());

    in_1_dev.ToDevice(in_1.mData.data());

    if(beta != 0.0f)
        out_dev.ToDevice(out.mData.data());

    if(do_verify)
    {
        ReductionHost<InOutDataType,
                      AccDataType,
                      InOutDataType,
                      ReduceOperation,
                      InElementwiseOperation,
                      AccElementwiseOperation,
                      5, // Rank
                      2, // NumReduceDim
                      PropagateNan,
                      OutputIndex>
            hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);

        hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr);
    };

    std::vector<ck::index_t> i_inLengths_1;
    std::vector<ck::index_t> i_inStrides_1;
    std::vector<ck::index_t> i_inLengths_2;
    std::vector<ck::index_t> i_inStrides_2;
    std::vector<ck::index_t> i_outLengths;
    std::vector<ck::index_t> i_outStrides;

    i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
    i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
    i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
    i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
    i_outLengths.assign(outLengths.begin(), outLengths.end());
    i_outStrides.assign(outStrides.begin(), outStrides.end());

    auto reduce_1 = DeviceReduceInstance_1{};

    auto argument_ptr_1 =
        reduce_1.MakeArgumentPointer(i_inLengths_1,
                                     i_inStrides_1,
                                     i_inLengths_2,
                                     i_inStrides_2,
                                     reduceDims_1,
                                     1.0f,
                                     0.0f,
                                     in_1_dev.GetDeviceBuffer(),
                                     nullptr,
                                     in_2_dev.GetDeviceBuffer(),
                                     nullptr,
                                     InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
                                     PassThroughOp{});

    if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
    {
        std::cout
            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
            << std::endl;
    };

    auto invoker_ptr_1 = reduce_1.MakeInvokerPointer();

    auto reduce_2 = DeviceReduceInstance_2{};

    auto argument_ptr_2 =
        reduce_2.MakeArgumentPointer(i_inLengths_2,
                                     i_inStrides_2,
                                     i_outLengths,
                                     i_outStrides,
                                     reduceDims_2,
                                     alpha,
                                     beta,
                                     in_2_dev.GetDeviceBuffer(),
                                     nullptr,
                                     out_dev.GetDeviceBuffer(),
                                     nullptr,
                                     PassThroughOp{},
                                     AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});

    if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
    {
        std::cout
            << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!"
            << std::endl;
    };

    auto invoker_ptr_2 = reduce_2.MakeInvokerPointer();

    float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel});
    float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel});

    std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) +
                            invariant_total_length * sizeof(InOutDataType);

    float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2);

    std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, "
              << reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl;

    bool pass = true;

    if(do_verify)
    {
        out_dev.FromDevice(out.mData.data());

        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
    };

    return (pass ? 0 : 1);
}
example/13_pool2d_fwd/CMakeLists.txt  View file @ a3b4c5cb
-add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
+add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
+add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)
example/13_pool2d_fwd/README.md  View file @ a3b4c5cb
-# Instructions for ```example_pool2d_fwd``` Example
+# Instructions for ```example_pool2d_fwd``` Examples

-## Run ```example_pool2d_fwd```
+## Run ```example_pool2d_fwd_fp16```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
-#arg3: run kernel # of times (>1)
+#arg3: time kernel (0=no, 1=yes)
 #arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
-./bin/example_pool2d_fwd 1 1 10
+./bin/example_pool2d_fwd_fp16 1 1 1
 ```
 Result
@@ -14,9 +14,28 @@ Result
 in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
 launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
-Warm up
+Warm up 1 time
 Start running 10 times...
-Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
-error: 0
-max_diff: 0, 1, 1
+Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
+```
+
+## Run ```example_pool2d_fwd_fp32```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
+#arg3: time kernel (0=no, 1=yes)
+#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
+./bin/example_pool2d_fwd_fp32 1 1 1
+```
+Result
+```
+./bin/example_pool2d_fwd_fp32 1 1 1
+in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
+out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
+launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
+Warm up 1 time
+Start running 10 times...
+Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
 ```
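The 36 x 36 output lengths reported in both runs follow from the pooling geometry these examples default to (3x3 window, stride 2, padding 1 on a 71x71 input). A quick standalone sketch of the size computation, mirroring the Ho/Wo formula in pool2d_fwd_common.hpp below:

```cpp
#include <cstdio>

// Ho = (Hi + LeftPy + RightPy - Y) / Sy + 1, and likewise for Wo;
// with the default shape this reproduces the 36 x 36 output reported above.
int main()
{
    const int Hi = 71, Wi = 71;         // input spatial lengths
    const int Y = 3, X = 3;             // pooling window
    const int Sy = 2, Sx = 2;           // window strides
    const int LeftPy = 1, LeftPx = 1;   // left pads
    const int RightPy = 1, RightPx = 1; // right pads

    const int Ho = (Hi + LeftPy + RightPy - Y) / Sy + 1;
    const int Wo = (Wi + LeftPx + RightPx - X) / Sx + 1;

    std::printf("Ho = %d, Wo = %d\n", Ho, Wo); // prints Ho = 36, Wo = 36
}
```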
example/13_pool2d_fwd/pool2d_fwd
.c
pp
→
example/13_pool2d_fwd/pool2d_fwd
_common.h
pp
View file @
a3b4c5cb
#pragma once
#include <iostream>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "check_err.hpp"
#include "config.hpp"
#include "config.hpp"
...
@@ -10,89 +8,67 @@
...
@@ -10,89 +8,67 @@
#include "device.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_tensor_generator.hpp"
#include "host_reduce_util.hpp"
#include "device_tensor.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "tensor_layout.hpp"
#include "reduction_operator.hpp"
#include "reduction_enums.hpp"
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
#include "reduction_operator_mapping.hpp"
#include "reduction_functions_accumulate.hpp"
using
InDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NHWC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NHWC
;
#if 1
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
#else
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
#endif
static
constexpr
bool
NeedIndices
=
false
;
static
constexpr
bool
PropagateNan
=
false
;
using
DevicePoolFwdInstance
=
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
AccDataType
,
// AccDataType
ReduceOpId
,
NeedIndices
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
4
>
;
// InSrcOutDstVectorSize
template
<
typename
InDataType
,
template
<
typename
InDataType
,
typename
OutDataType
,
typename
OutDataType
,
typename
AccDataType
,
typename
AccDataType
,
typename
IndexDataType
,
ck
::
ReduceTensorOp
ReduceOpId
,
ck
::
ReduceTensorOp
ReduceOpId
,
bool
PropagateNan
,
bool
PropagateNan
,
bool
NeedIndices
>
bool
OutputIndex
>
static
void
pool_host_verify
(
const
Tensor
<
InDataType
>&
in
,
static
void
pool_host_verify
(
const
Tensor
<
InDataType
>&
in
,
Tensor
<
OutDataType
>&
out
,
Tensor
<
OutDataType
>&
out
,
Tensor
<
int
>&
out_indices
,
Tensor
<
IndexDataType
>&
out_indices
,
const
std
::
array
<
ck
::
index_t
,
2
>&
window_spatial_lengths
,
const
std
::
array
<
ck
::
index_t
,
2
>&
window_spatial_lengths
,
const
std
::
array
<
ck
::
index_t
,
2
>&
window_strides
,
const
std
::
array
<
ck
::
index_t
,
2
>&
window_strides
,
const
std
::
array
<
ck
::
index_t
,
2
>&
in_left_pads
,
const
std
::
array
<
ck
::
index_t
,
2
>&
in_left_pads
,
const
std
::
array
<
ck
::
index_t
,
2
>&
/*in_right_pads*/
)
const
std
::
array
<
ck
::
index_t
,
2
>&
/*in_right_pads*/
)
{
{
using
namespace
ck
::
host_reduce
;
const
int32_t
divider
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
]
;
const
int
divider
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
using
ReduceOperation
=
typename
ck
::
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
ck
::
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
ck
::
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
const
auto
PreUnaryOp
=
PreUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
const
InElementwiseOperation
in_elementwise_op
(
divider
);
const
auto
PosUnaryOp
=
PosUnaryOpFn
<
AccDataType
,
ReduceOpId
>
(
divider
);
const
AccElementwiseOperation
acc_elementwise_op
(
divider
);
if
constexpr
(
!
NeedIndices
)
if
constexpr
(
!
OutputIndex
)
{
{
auto
opReduce
=
ReduceOpFn
<
AccDataType
,
ReduceOpId
>
();
using
Accumulation
=
ck
::
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
>
;
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOp
Z
er
oVal
<
AccDataType
,
ReduceOpId
>
();
auto
accuVal
=
ReduceOper
ation
::
GetIdentityValue
();
for
(
in
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
for
(
ck
::
index_
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
{
{
in
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
ck
::
index_
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
for
(
in
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
for
(
ck
::
index_
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
{
in
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
ck
::
index_
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
if
(
hi
>=
0
&&
hi
<
static_cast
<
ck
::
index_t
>
(
in
.
mDesc
.
GetLengths
()[
2
]
)
&&
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
wi
>=
0
&&
wi
<
static_cast
<
ck
::
index_t
>
(
in
.
mDesc
.
GetLengths
()[
3
])
)
{
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
PreUnaryOp
(
currVal
);
in_elementwise_op
(
currVal
,
currVal
);
binop_with_nan_check
<
AccDataType
,
PropagateNan
>
(
opReduce
,
accuVal
,
currVal
);
Accumulation
::
Calculate
(
accuVal
,
currVal
);
}
}
}
}
}
}
PosUnaryOp
(
accuVal
);
acc_elementwise_op
(
accuVal
,
accuVal
);
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
};
};
...
@@ -105,33 +81,34 @@ static void pool_host_verify(const Tensor<InDataType>& in,
...
@@ -105,33 +81,34 @@ static void pool_host_verify(const Tensor<InDataType>& in,
}
}
else
else
{
{
auto
opReduce
=
ReduceOpFn2
<
AccDataType
,
ReduceOpId
>
();
using
Accumulation
=
ck
::
detail
::
AccumulateWithIndexAndNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
,
IndexDataType
>
;
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
f_nchw
=
[
&
](
auto
n
,
auto
c
,
auto
ho
,
auto
wo
)
{
auto
accuVal
=
ReduceOp
Z
er
oVal
<
AccDataType
,
ReduceOpId
>
();
auto
accuVal
=
ReduceOper
ation
::
GetIdentityValue
();
int
accuIndex
=
0
;
IndexDataType
accuIndex
=
0
;
for
(
in
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
for
(
ck
::
index_
t
y
=
0
;
y
<
window_spatial_lengths
[
0
];
++
y
)
{
{
in
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
ck
::
index_
t
hi
=
ho
*
window_strides
[
0
]
+
y
-
in_left_pads
[
0
];
for
(
in
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
for
(
ck
::
index_
t
x
=
0
;
x
<
window_spatial_lengths
[
1
];
++
x
)
{
{
in
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
ck
::
index_
t
wi
=
wo
*
window_strides
[
1
]
+
x
-
in_left_pads
[
1
];
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
if
(
hi
>=
0
&&
hi
<
in
.
mDesc
.
GetLengths
()[
2
]
&&
wi
>=
0
&&
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
wi
<
in
.
mDesc
.
GetLengths
()[
3
])
{
{
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
AccDataType
currVal
=
static_cast
<
AccDataType
>
(
in
(
n
,
c
,
hi
,
wi
));
int
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
IndexDataType
currIndex
=
y
*
window_spatial_lengths
[
1
]
+
x
;
PreUnaryOp
(
currVal
);
in_elementwise_op
(
currVal
,
currVal
);
binop_with_nan_check2
<
AccDataType
,
PropagateNan
>
(
Accumulation
::
Calculate
(
accuVal
,
currVal
,
accuIndex
,
currIndex
);
opReduce
,
accuVal
,
currVal
,
accuIndex
,
currIndex
);
}
}
}
}
}
}
PosUnaryOp
(
accuVal
);
acc_elementwise_op
(
accuVal
,
accuVal
);
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
out
(
n
,
c
,
ho
,
wo
)
=
accuVal
;
out_indices
(
n
,
c
,
ho
,
wo
)
=
accuIndex
;
out_indices
(
n
,
c
,
ho
,
wo
)
=
accuIndex
;
...
@@ -145,62 +122,44 @@ static void pool_host_verify(const Tensor<InDataType>& in,
...
@@ -145,62 +122,44 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
};
}
}
int
main
(
int
argc
,
char
*
argv
[])
template
<
typename
InDataType
,
typename
OutDataType
,
typename
AccDataType
,
typename
IndexDataType
,
typename
InLayout
,
typename
OutLayout
,
ck
::
ReduceTensorOp
ReduceOpId
,
bool
PropagateNan
,
bool
OutputIndex
>
bool
pool_test
(
bool
do_verification
,
int
init_method
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Y
,
ck
::
index_t
X
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
{
using
namespace
ck
::
host_reduce
;
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
bool
do_verification
=
0
;
InDataType
,
// InDataType
int
init_method
=
0
;
OutDataType
,
// OutDataType
int
nrepeat
=
5
;
AccDataType
,
// AccDataType
ReduceOpId
,
// Pool shape
OutputIndex
,
ck
::
index_t
N
=
128
;
64
,
// BlockSize
ck
::
index_t
C
=
192
;
64
,
// ReduceMThreadClusterSize
ck
::
index_t
Y
=
3
;
1
,
// ReduceKThreadClusterSize
ck
::
index_t
X
=
3
;
4
,
// ReduceMThreadSliceSize
ck
::
index_t
Hi
=
71
;
1
,
// ReduceKThreadSliceSize
ck
::
index_t
Wi
=
71
;
4
>
;
// InSrcOutDstVectorSize
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
in_left_pad_h
=
1
;
ck
::
index_t
in_left_pad_w
=
1
;
ck
::
index_t
in_right_pad_h
=
1
;
ck
::
index_t
in_right_pad_w
=
1
;
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
16
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
nrepeat
=
std
::
stoi
(
argv
[
3
]);
N
=
std
::
stoi
(
argv
[
4
]);
C
=
std
::
stoi
(
argv
[
5
]);
Y
=
std
::
stoi
(
argv
[
6
]);
X
=
std
::
stoi
(
argv
[
7
]);
Hi
=
std
::
stoi
(
argv
[
8
]);
Wi
=
std
::
stoi
(
argv
[
9
]);
window_stride_h
=
std
::
stoi
(
argv
[
10
]);
window_stride_w
=
std
::
stoi
(
argv
[
11
]);
in_left_pad_h
=
std
::
stoi
(
argv
[
12
]);
in_left_pad_w
=
std
::
stoi
(
argv
[
13
]);
in_right_pad_h
=
std
::
stoi
(
argv
[
14
]);
in_right_pad_w
=
std
::
stoi
(
argv
[
15
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3: run kernel # of times (>1)
\n
"
);
printf
(
"arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx
\n
"
);
exit
(
0
);
}
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
...
@@ -228,9 +187,11 @@ int main(int argc, char* argv[])
...
@@ -228,9 +187,11 @@ int main(int argc, char* argv[])
Tensor
<
InDataType
>
in_n_c_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
InDataType
>
in_n_c_hi_wi
(
f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
-   Tensor<int> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+   Tensor<IndexDataType> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
-   Tensor<int> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
+   Tensor<IndexDataType> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;

@@ -245,17 +206,17 @@ int main(int argc, char* argv[])
    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
    DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace());
-   DeviceMem out_indices_device_buf(sizeof(int) *
-                                    out_indices_n_c_ho_wo_device.mDesc.GetElementSpace());
+   DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
+                                    out_indices_n_c_ho_wo_device.mDesc.GetElementSpace());
    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
    auto pool        = DevicePoolFwdInstance{};
    auto invoker_ptr = pool.MakeInvokerPointer();
    auto argument_ptr =
        pool.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                 static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                static_cast<int*>(out_indices_device_buf.GetDeviceBuffer()),
+                                static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
                                 N,
                                 C,
                                 std::array<ck::index_t, 2>{{Hi, Wi}},

@@ -271,7 +232,7 @@ int main(int argc, char* argv[])
                                 "not support this problem");
    }
-   float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+   float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
    std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X;

@@ -285,14 +246,17 @@ int main(int argc, char* argv[])
    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;
+   bool pass = true;
    if(do_verification)
    {
        pool_host_verify<InDataType,
                         OutDataType,
                         AccDataType,
+                        IndexDataType,
                         ReduceOpId,
                         PropagateNan,
-                        NeedIndices>(in_n_c_hi_wi,
+                        OutputIndex>(in_n_c_hi_wi,
                                      out_n_c_ho_wo_host,
                                      out_indices_n_c_ho_wo_host,
                                      window_spatial_lengths,

@@ -302,14 +266,16 @@ int main(int argc, char* argv[])
        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
-       ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
+       pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
-       if constexpr(NeedIndices)
+       if constexpr(OutputIndex)
        {
            out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
-           // ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
-           //                      out_indices_n_c_ho_wo_host.mData);
-           ;
+           pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
+                                               out_indices_n_c_ho_wo_host.mData);
        };
    }
-}
+
+   return (pass);
+};
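Note: pool_host_verify in the hunk above produces the host reference that ck::utils::check_err compares the device output against. As a rough, self-contained illustration of that reference computation only (this is not the library's implementation; tensor descriptors and layouts are simplified to a flat NCHW buffer, and only the MAX reduction is shown), a host-side max-pool could look like:

// Simplified host-side 2D max-pool reference (illustration only; the example
// above uses pool_host_verify from pool2d_fwd_common.hpp instead).
#include <algorithm>
#include <limits>
#include <vector>

// in  : N x C x Hi x Wi, row-major; out : N x C x Ho x Wo, row-major
void max_pool2d_reference(const std::vector<float>& in,
                          std::vector<float>& out,
                          int N, int C, int Hi, int Wi, int Ho, int Wo,
                          int Y, int X,                 // pooling window
                          int stride_h, int stride_w,
                          int pad_h, int pad_w)         // left padding
{
    for(int n = 0; n < N; ++n)
        for(int c = 0; c < C; ++c)
            for(int ho = 0; ho < Ho; ++ho)
                for(int wo = 0; wo < Wo; ++wo)
                {
                    float v = -std::numeric_limits<float>::infinity();

                    for(int y = 0; y < Y; ++y)
                        for(int x = 0; x < X; ++x)
                        {
                            const int hi = ho * stride_h + y - pad_h;
                            const int wi = wo * stride_w + x - pad_w;

                            // window taps that fall into the padding are skipped
                            if(hi >= 0 && hi < Hi && wi >= 0 && wi < Wi)
                                v = std::max(v, in[((n * C + c) * Hi + hi) * Wi + wi]);
                        }

                    out[((n * C + c) * Ho + ho) * Wo + wo] = v;
                }
}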
example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
0 → 100644
View file @
a3b4c5cb
#include <iostream>
#include <cstdlib>

#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"

#include "pool2d_fwd_common.hpp"

using InDataType  = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

using IndexDataType = int32_t;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex  = false;
static constexpr bool PropagateNan = false;

int main(int argc, char* argv[])
{
    bool do_verification;
    int init_method;
    bool time_kernel;

    // Pool shape
    ck::index_t N               = 128;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        do_verification = true;
        init_method     = 1;
        time_kernel     = true;
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
    }
    else if(argc == 16)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));

        N               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        window_stride_h = std::stoi(argv[10]);
        window_stride_w = std::stoi(argv[11]);
        in_left_pad_h   = std::stoi(argv[12]);
        in_left_pad_w   = std::stoi(argv[13]);
        in_right_pad_h  = std::stoi(argv[14]);
        in_right_pad_w  = std::stoi(argv[15]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(0);
    }

    bool pass = pool_test<InDataType,
                          OutDataType,
                          AccDataType,
                          IndexDataType,
                          InLayout,
                          OutLayout,
                          ReduceOpId,
                          PropagateNan,
                          OutputIndex>(do_verification,
                                       init_method,
                                       time_kernel,
                                       N,
                                       C,
                                       Y,
                                       X,
                                       Hi,
                                       Wi,
                                       window_stride_h,
                                       window_stride_w,
                                       in_left_pad_h,
                                       in_left_pad_w,
                                       in_right_pad_h,
                                       in_right_pad_w);

    return (pass ? 0 : 1);
}
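For reference, this driver accepts three argument patterns, matching the argc checks above: no arguments (built-in defaults), 3 arguments (verification, init method, time kernel), or 15 arguments (those three plus the full pool shape). Assuming the executable is built under a target name such as example_pool2d_fwd_fp16 (the actual name comes from the CMakeLists.txt change, which is not shown in this hunk), an invocation with an explicit shape would look like:

./example_pool2d_fwd_fp16 1 1 1 128 192 3 3 71 71 2 2 1 1 1 1

i.e. verify the result, use integer initialization, time the kernel, then N=128, C=192, window Y=X=3, Hi=Wi=71, strides Sy=Sx=2, and one-pixel left/right padding in both dimensions.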
example/13_pool2d_fwd/pool2d_fwd_fp32.cpp
0 → 100644
View file @
a3b4c5cb
#include <iostream>
#include <cstdlib>

#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"

#include "pool2d_fwd_common.hpp"

using InDataType  = float;
using OutDataType = float;
using AccDataType = float;

using IndexDataType = int32_t;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex  = false;
static constexpr bool PropagateNan = false;

int main(int argc, char* argv[])
{
    bool do_verification;
    int init_method;
    bool time_kernel;

    // Pool shape
    ck::index_t N               = 128;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 1)
    {
        do_verification = true;
        init_method     = 1;
        time_kernel     = true;
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));
    }
    else if(argc == 16)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));

        N               = std::stoi(argv[4]);
        C               = std::stoi(argv[5]);
        Y               = std::stoi(argv[6]);
        X               = std::stoi(argv[7]);
        Hi              = std::stoi(argv[8]);
        Wi              = std::stoi(argv[9]);
        window_stride_h = std::stoi(argv[10]);
        window_stride_w = std::stoi(argv[11]);
        in_left_pad_h   = std::stoi(argv[12]);
        in_left_pad_w   = std::stoi(argv[13]);
        in_right_pad_h  = std::stoi(argv[14]);
        in_right_pad_w  = std::stoi(argv[15]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");
        exit(0);
    }

    bool pass = pool_test<InDataType,
                          OutDataType,
                          AccDataType,
                          IndexDataType,
                          InLayout,
                          OutLayout,
                          ReduceOpId,
                          PropagateNan,
                          OutputIndex>(do_verification,
                                       init_method,
                                       time_kernel,
                                       N,
                                       C,
                                       Y,
                                       X,
                                       Hi,
                                       Wi,
                                       window_stride_h,
                                       window_stride_w,
                                       in_left_pad_h,
                                       in_left_pad_w,
                                       in_right_pad_h,
                                       in_right_pad_w);

    return (pass ? 0 : 1);
}
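Both drivers only select data types; the output spatial size is derived inside pool2d_fwd_common.hpp, which is not part of this hunk. Assuming the usual windowed-reduction convention Ho = (Hi + LeftPy + RightPy - Y) / Sy + 1 (stated here as an assumption, not taken from the diff), the default shape above yields Ho = Wo = 36:

// Output-size arithmetic for the default pool shape (assumed convention:
// floor((size + pad_left + pad_right - window) / stride) + 1).
#include <cstdio>

int main()
{
    const int Hi = 71, Wi = 71;        // input height/width
    const int Y = 3, X = 3;            // pooling window
    const int Sy = 2, Sx = 2;          // window strides
    const int LeftPy = 1, RightPy = 1; // height padding
    const int LeftPx = 1, RightPx = 1; // width padding

    const int Ho = (Hi + LeftPy + RightPy - Y) / Sy + 1; // (71 + 2 - 3) / 2 + 1 = 36
    const int Wo = (Wi + LeftPx + RightPx - X) / Sx + 1; // 36

    std::printf("Ho = %d, Wo = %d\n", Ho, Wo);
    return 0;
}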