"vscode:/vscode.git/clone" did not exist on "dfa0213942000be269a1b05058f26893e7e9e56f"
Unverified Commit 827301d9 authored by Qianfeng's avatar Qianfeng Committed by GitHub
Browse files

Pr82 followup (#115)

* Use thread cluster descriptor and explicit M_K 2d descriptor to simply Blockwise Reduction

* Change by replacing ReduceDims by NumReduceDims as Device Reduce interface template parameter

* Rename the folder name for the pool2d and reduce examples

* Update to reduction test scripts

* Add Readme for pool2d_fwd and reduce_blockwise examples

* Tiny fix in reduce profiler and tiny update in reduce testing scripts

* Tiny fix in testing script profile_reduce_no_index.sh

* Tiny change in script/profile_reduce_with_index.sh

* Renaming and refining in Reduction profiler/device layer/examples

* Renaming and refining in Reduction profiler/device layer/examples

* Renaming all NumReduceDims to NumReduceDim
parent 5d37d7bf
...@@ -6,37 +6,37 @@ namespace device { ...@@ -6,37 +6,37 @@ namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
// clang-format off // clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2 ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
// Will be moved to use MultiBlockAtomicAdd // Will be moved to use MultiBlockAtomicAdd
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); // ADD_MULTIBLOCK_PARTIAL_REDUCE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
// clang-format on // clang-format on
} // namespace device_reduce_instance } // namespace device_reduce_instance
......
...@@ -6,25 +6,25 @@ namespace device { ...@@ -6,25 +6,25 @@ namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
// clang-format off // clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0, 1, 2); // for MIN ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0, 1, 2); // for MAX ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0, 1, 2); // for AMAX ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0, 1, 2); // for MIN ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0, 1, 2); // for MAX ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0, 1, 2); // for AMAX ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on // clang-format on
} // namespace device_reduce_instance } // namespace device_reduce_instance
......
...@@ -6,16 +6,16 @@ namespace device { ...@@ -6,16 +6,16 @@ namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
// clang-format off // clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0, 1, 2); // for ADD ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 0); ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0, 1, 2); // for AVG ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0, 1, 2); // for NORM2 ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
// clang-format on // clang-format on
} // namespace device_reduce_instance } // namespace device_reduce_instance
......
...@@ -6,34 +6,34 @@ namespace device { ...@@ -6,34 +6,34 @@ namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
// clang-format off // clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0, 1, 2); // for ADD ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 0); ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0, 1, 2); // for AVG ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0, 1, 2); // for MIN ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0, 1, 2); // for MAX ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0, 1, 2); // for AMAX ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0, 1, 2); // for MIN ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0, 1, 2); // for MAX ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0, 1, 2); // for AMAX ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1);
// clang-format on // clang-format on
} // namespace device_reduce_instance } // namespace device_reduce_instance
......
...@@ -6,16 +6,16 @@ namespace device { ...@@ -6,16 +6,16 @@ namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
// clang-format off // clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0, 1, 2); // for ADD ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 0); ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0, 1, 2); // for AVG ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0, 1, 2); // for NORM2 ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1);
// clang-format on // clang-format on
} // namespace device_reduce_instance } // namespace device_reduce_instance
......
...@@ -6,34 +6,34 @@ namespace device { ...@@ -6,34 +6,34 @@ namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
// clang-format off // clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | ReduceDims // InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0, 1, 2); // for ADD ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 0); ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0, 1, 2); // for AVG ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0, 1, 2); // for NORM2 ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0, 1, 2); // for MIN ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0, 1, 2); // for MAX ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0, 1, 2); // for AMAX ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0, 1, 2); // for MIN ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0, 1, 2); // for MAX ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0, 1, 2); // for AMAX ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 0); // ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); // ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1);
// clang-format on // clang-format on
} // namespace device_reduce_instance } // namespace device_reduce_instance
......
...@@ -9,54 +9,52 @@ namespace tensor_operation { ...@@ -9,54 +9,52 @@ namespace tensor_operation {
namespace device { namespace device {
namespace device_reduce_instance { namespace device_reduce_instance {
template <int Rank, typename ReduceDims, int ReduceOpId, int NanOpt, int IndicesOpt> template <int Rank, int NumReduceDim, int ReduceOpId, int NanOpt, int IndicesOpt>
struct ReduceDescription struct ReduceDescription
{ {
static constexpr int Rank_ = Rank; static constexpr int Rank_ = Rank;
static constexpr int ReduceOpId_ = ReduceOpId; static constexpr int NumReduceDim_ = NumReduceDim;
static constexpr int NanOpt_ = NanOpt; static constexpr int ReduceOpId_ = ReduceOpId;
static constexpr int IndicesOpt_ = IndicesOpt; static constexpr int NanOpt_ = NanOpt;
static constexpr int IndicesOpt_ = IndicesOpt;
using ReduceDims_ = ReduceDims;
}; };
using reduce_description_instances = using reduce_description_instances = std::tuple<ReduceDescription<4, 3, 0, 0, 0>, // for ADD
std::tuple<ReduceDescription<4, Sequence<0, 1, 2>, 0, 0, 0>, // for ADD ReduceDescription<4, 1, 0, 0, 0>,
ReduceDescription<4, Sequence<0>, 0, 0, 0>, ReduceDescription<2, 1, 0, 0, 0>,
ReduceDescription<2, Sequence<1>, 0, 0, 0>,
ReduceDescription<4, 3, 5, 0, 0>, // for AVG
ReduceDescription<4, Sequence<0, 1, 2>, 5, 0, 0>, // for AVG ReduceDescription<4, 1, 5, 0, 0>,
ReduceDescription<4, Sequence<0>, 5, 0, 0>, ReduceDescription<2, 1, 5, 0, 0>,
ReduceDescription<2, Sequence<1>, 5, 0, 0>,
ReduceDescription<4, 3, 7, 0, 0>, // for NORM2
ReduceDescription<4, Sequence<0, 1, 2>, 7, 0, 0>, // for NORM2 ReduceDescription<4, 1, 7, 0, 0>,
ReduceDescription<4, Sequence<0>, 7, 0, 0>, ReduceDescription<2, 1, 7, 0, 0>,
ReduceDescription<2, Sequence<1>, 7, 0, 0>,
ReduceDescription<4, 3, 2, 0, 0>, // for MIN
ReduceDescription<4, Sequence<0, 1, 2>, 2, 0, 0>, // for MIN ReduceDescription<4, 1, 2, 0, 0>,
ReduceDescription<4, Sequence<0>, 2, 0, 0>, ReduceDescription<2, 1, 2, 0, 0>,
ReduceDescription<2, Sequence<1>, 2, 0, 0>, ReduceDescription<4, 3, 3, 0, 0>, // for MAX
ReduceDescription<4, Sequence<0, 1, 2>, 3, 0, 0>, // for MAX ReduceDescription<4, 1, 3, 0, 0>,
ReduceDescription<4, Sequence<0>, 3, 0, 0>, ReduceDescription<2, 1, 3, 0, 0>,
ReduceDescription<2, Sequence<1>, 3, 0, 0>, ReduceDescription<4, 3, 4, 0, 0>, // for AMAX
ReduceDescription<4, Sequence<0, 1, 2>, 4, 0, 0>, // for AMAX ReduceDescription<4, 1, 4, 0, 0>,
ReduceDescription<4, Sequence<0>, 4, 0, 0>, ReduceDescription<2, 1, 4, 0, 0>,
ReduceDescription<2, Sequence<1>, 4, 0, 0>,
ReduceDescription<4, 3, 2, 0, 1>, // for MIN
ReduceDescription<4, Sequence<0, 1, 2>, 2, 0, 1>, // for MIN ReduceDescription<4, 1, 2, 0, 1>,
ReduceDescription<4, Sequence<0>, 2, 0, 1>, ReduceDescription<2, 1, 2, 0, 1>,
ReduceDescription<2, Sequence<1>, 2, 0, 1>, ReduceDescription<4, 3, 3, 0, 1>, // for MAX
ReduceDescription<4, Sequence<0, 1, 2>, 3, 0, 1>, // for MAX ReduceDescription<4, 1, 3, 0, 1>,
ReduceDescription<4, Sequence<0>, 3, 0, 1>, ReduceDescription<2, 1, 3, 0, 1>,
ReduceDescription<2, Sequence<1>, 3, 0, 1>, ReduceDescription<4, 3, 4, 0, 1>, // for AMAX
ReduceDescription<4, Sequence<0, 1, 2>, 4, 0, 1>, // for AMAX ReduceDescription<4, 1, 4, 0, 1>,
ReduceDescription<4, Sequence<0>, 4, 0, 1>, ReduceDescription<2, 1, 4, 0, 1>>;
ReduceDescription<2, Sequence<1>, 4, 0, 1>>;
template <typename DescriptionType> template <typename DescriptionType>
bool description_match(const DescriptionType& description, bool description_match(const DescriptionType& description,
int Rank, int Rank,
const std::vector<int>& ReduceDims, const std::vector<int>& reduceDims,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp_t ReduceOpId,
NanPropagation_t NanOpt, NanPropagation_t NanOpt,
ReduceTensorIndices_t IndicesOpt) ReduceTensorIndices_t IndicesOpt)
...@@ -66,16 +64,11 @@ bool description_match(const DescriptionType& description, ...@@ -66,16 +64,11 @@ bool description_match(const DescriptionType& description,
description.IndicesOpt_ != static_cast<int>(IndicesOpt)) description.IndicesOpt_ != static_cast<int>(IndicesOpt))
return (false); return (false);
if(DescriptionType::ReduceDims_::Size() != ReduceDims.size()) if(DescriptionType::NumReduceDim_ != reduceDims.size())
return (false); return (false);
bool result = true; bool result = true;
static_for<0, DescriptionType::ReduceDims_::Size(), 1>{}([&](auto i) {
if(DescriptionType::ReduceDims_::At(i) != ReduceDims[i])
result = false;
});
return (result); return (result);
}; };
...@@ -87,33 +80,29 @@ bool description_match(const DescriptionType& description, ...@@ -87,33 +80,29 @@ bool description_match(const DescriptionType& description,
namespace ck { namespace ck {
namespace profiler { namespace profiler {
template <int Rank, typename ReduceDims> template <index_t Rank, index_t NumReduceDim>
static std::vector<int> get_reduce_dims() static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduceDims)
{
std::vector<int> resDims;
static_for<0, ReduceDims::Size(), 1>{}([&](auto i) { resDims.push_back(ReduceDims::At(i)); });
return (resDims);
};
template <int Rank, typename ReduceDims>
static std::vector<int> get_invariant_dims()
{ {
std::vector<int> resDims; assert(NumReduceDim == reduceDims.size());
unsigned int incFlag = 0;
static_for<0, ReduceDims::Size(), 1>{}( int reduceFlag = 0;
[&](auto i) { incFlag = incFlag | (0x1 << ReduceDims::At(i)); });
for(int dim = 0; dim < Rank; dim++) // flag the bits for the reduceDims
for(int i = 0; i < NumReduceDim; i++)
{ {
if(incFlag & (0x1 << dim)) reduceFlag |= 1 << reduceDims[i];
continue;
resDims.push_back(dim);
}; };
return (resDims); std::vector<int> invariantDims;
// collect invariant dimensions
for(int i = 0; i < Rank; i++)
if((reduceFlag & (1 << i)) == 0)
{
invariantDims.push_back(i);
};
return invariantDims;
}; };
template <typename T> template <typename T>
...@@ -149,7 +138,7 @@ template <typename InDataType, ...@@ -149,7 +138,7 @@ template <typename InDataType,
typename AccDataType, typename AccDataType,
typename OutDataType, typename OutDataType,
int Rank, int Rank,
typename ReduceDims_, int NumReduceDim,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp_t ReduceOpId,
NanPropagation_t NanOpt, NanPropagation_t NanOpt,
ReduceTensorIndices_t IndicesOpt> ReduceTensorIndices_t IndicesOpt>
...@@ -159,6 +148,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -159,6 +148,7 @@ void profile_reduce_impl_impl(bool do_verification,
bool do_dumpout, bool do_dumpout,
int nrepeat, int nrepeat,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
float alpha, float alpha,
float beta) float beta)
{ {
...@@ -203,15 +193,14 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -203,15 +193,14 @@ void profile_reduce_impl_impl(bool do_verification,
{ {
Tensor<InDataType> in(inLengths); Tensor<InDataType> in(inLengths);
const std::vector<int> OuterDims = get_invariant_dims<Rank, ReduceDims_>();
const std::vector<int> ReduceDims = get_reduce_dims<Rank, ReduceDims_>();
std::vector<size_t> outLengths; std::vector<size_t> outLengths;
if(OuterDims.empty()) const auto invariantDims = get_invariant_dims<Rank, NumReduceDim>(reduceDims);
if(reduceDims.size() == Rank)
outLengths.push_back(1); outLengths.push_back(1);
else else
for(auto dim : OuterDims) for(auto dim : invariantDims)
outLengths.push_back(inLengths[dim]); outLengths.push_back(inLengths[dim]);
Tensor<OutDataType> out_ref(outLengths); Tensor<OutDataType> out_ref(outLengths);
...@@ -302,7 +291,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -302,7 +291,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
ReduceDims_, NumReduceDim,
ReduceOpId, ReduceOpId,
NanOpt, NanOpt,
IndicesOpt>(reduce0_ptrs); IndicesOpt>(reduce0_ptrs);
...@@ -311,7 +300,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -311,7 +300,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
ReduceDims_, NumReduceDim,
ReduceOpId, ReduceOpId,
NanOpt, NanOpt,
IndicesOpt>(reduce0_ptrs); IndicesOpt>(reduce0_ptrs);
...@@ -321,7 +310,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -321,7 +310,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
ReduceDims_, NumReduceDim,
ReduceOpId, ReduceOpId,
NanOpt, NanOpt,
IndicesOpt>(reduce0_ptrs); IndicesOpt>(reduce0_ptrs);
...@@ -330,7 +319,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -330,7 +319,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
ReduceDims_, NumReduceDim,
ReduceOpId, ReduceOpId,
NanOpt, NanOpt,
IndicesOpt>(reduce1_ptrs); IndicesOpt>(reduce1_ptrs);
...@@ -341,7 +330,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -341,7 +330,7 @@ void profile_reduce_impl_impl(bool do_verification,
AccDataType, AccDataType,
OutDataType, OutDataType,
Rank, Rank,
ReduceDims_, NumReduceDim,
ReduceOpId, ReduceOpId,
NanOpt, NanOpt,
IndicesOpt>(reduce2_ptrs); IndicesOpt>(reduce2_ptrs);
...@@ -358,7 +347,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -358,7 +347,7 @@ void profile_reduce_impl_impl(bool do_verification,
using hCompType = typename type_mapping<AccDataType>::outDataType; using hCompType = typename type_mapping<AccDataType>::outDataType;
ReductionHost<hInType, hCompType, hOutType, ReduceOpId, PropagateNan, NeedIndices> ReductionHost<hInType, hCompType, hOutType, ReduceOpId, PropagateNan, NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, OuterDims, ReduceDims); hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run(alpha, hostReduce.Run(alpha,
reinterpret_cast<const hInType*>(in.mData.data()), reinterpret_cast<const hInType*>(in.mData.data()),
...@@ -383,6 +372,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -383,6 +372,7 @@ void profile_reduce_impl_impl(bool do_verification,
i_inStrides, i_inStrides,
i_outLengths, i_outLengths,
i_outStrides, i_outStrides,
reduceDims,
alpha, alpha,
beta, beta,
in_dev.GetDeviceBuffer(), in_dev.GetDeviceBuffer(),
...@@ -464,6 +454,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -464,6 +454,7 @@ void profile_reduce_impl_impl(bool do_verification,
i_inStrides, i_inStrides,
i_outLengths, i_outLengths,
i_outStrides, i_outStrides,
reduceDims,
alpha, alpha,
beta, beta,
in_dev.GetDeviceBuffer(), in_dev.GetDeviceBuffer(),
...@@ -496,6 +487,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -496,6 +487,7 @@ void profile_reduce_impl_impl(bool do_verification,
inStrides2, inStrides2,
i_outLengths, i_outLengths,
i_outStrides, i_outStrides,
reduceDims,
alpha, alpha,
beta, beta,
ws_dev.GetDeviceBuffer(), ws_dev.GetDeviceBuffer(),
...@@ -584,7 +576,7 @@ void profile_reduce_impl(bool do_verification, ...@@ -584,7 +576,7 @@ void profile_reduce_impl(bool do_verification,
bool do_dumpout, bool do_dumpout,
int nrepeat, int nrepeat,
const std::vector<size_t>& inLengths, const std::vector<size_t>& inLengths,
const std::vector<int>& ReduceDims, const std::vector<int>& reduceDims,
ReduceTensorOp_t ReduceOpId, ReduceTensorOp_t ReduceOpId,
NanPropagation_t NanOpt, NanPropagation_t NanOpt,
ReduceTensorIndices_t IndicesOpt, ReduceTensorIndices_t IndicesOpt,
...@@ -605,18 +597,26 @@ void profile_reduce_impl(bool do_verification, ...@@ -605,18 +597,26 @@ void profile_reduce_impl(bool do_verification,
using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>; using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
if(!description_match( if(!description_match(
descType{}, inLengths.size(), ReduceDims, ReduceOpId, NanOpt, IndicesOpt)) descType{}, inLengths.size(), reduceDims, ReduceOpId, NanOpt, IndicesOpt))
return; return;
profile_reduce_impl_impl<InDataType, profile_reduce_impl_impl<InDataType,
AccDataType, AccDataType,
OutDataType, OutDataType,
descType::Rank_, descType::Rank_,
typename descType::ReduceDims_, descType::NumReduceDim_,
static_cast<ReduceTensorOp_t>(descType::ReduceOpId_), static_cast<ReduceTensorOp_t>(descType::ReduceOpId_),
static_cast<NanPropagation_t>(descType::NanOpt_), static_cast<NanPropagation_t>(descType::NanOpt_),
static_cast<ReduceTensorIndices_t>(descType::IndicesOpt_)>( static_cast<ReduceTensorIndices_t>(descType::IndicesOpt_)>(
do_verification, init_method, do_log, do_dumpout, nrepeat, inLengths, alpha, beta); do_verification,
init_method,
do_log,
do_dumpout,
nrepeat,
inLengths,
reduceDims,
alpha,
beta);
matched = true; matched = true;
}); });
......
...@@ -25,7 +25,7 @@ using ck::ReduceTensorIndices_t; ...@@ -25,7 +25,7 @@ using ck::ReduceTensorIndices_t;
using ck::ReduceTensorOp_t; using ck::ReduceTensorOp_t;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"toReduceDims", required_argument, nullptr, 'R'}, {"reduceDims", required_argument, nullptr, 'R'},
{"reduceOp", required_argument, nullptr, 'O'}, {"reduceOp", required_argument, nullptr, 'O'},
{"compType", required_argument, nullptr, 'C'}, {"compType", required_argument, nullptr, 'C'},
{"outType", required_argument, nullptr, 'W'}, {"outType", required_argument, nullptr, 'W'},
...@@ -93,9 +93,9 @@ typedef enum ...@@ -93,9 +93,9 @@ typedef enum
appDouble = 6, appDouble = 6,
} appDataType_t; } appDataType_t;
static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDims) static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{ {
for(auto dim : toReduceDims) for(auto dim : reduceDims)
{ {
if(dim < 0 || dim >= rank) if(dim < 0 || dim >= rank)
throw std::runtime_error("Invalid dimension index specified for Reducing"); throw std::runtime_error("Invalid dimension index specified for Reducing");
...@@ -103,7 +103,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDi ...@@ -103,7 +103,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& toReduceDi
unsigned int flag = 0; unsigned int flag = 0;
for(auto dim : toReduceDims) for(auto dim : reduceDims)
{ {
if(flag & (0x1 << dim)) if(flag & (0x1 << dim))
throw std::runtime_error("All toReduce dimensions should be different!"); throw std::runtime_error("All toReduce dimensions should be different!");
...@@ -122,7 +122,7 @@ class AppArgs ...@@ -122,7 +122,7 @@ class AppArgs
std::vector<size_t> inLengths; std::vector<size_t> inLengths;
std::vector<size_t> outLengths; std::vector<size_t> outLengths;
std::vector<int> toReduceDims; std::vector<int> reduceDims;
std::vector<float> scales; std::vector<float> scales;
...@@ -152,7 +152,7 @@ class AppArgs ...@@ -152,7 +152,7 @@ class AppArgs
std::cout << "Usage of " << cmd << std::endl; std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl; << std::endl;
std::cout << "--toReduceDims or -R, comma separated list of to-reduce dimensions" std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions"
<< std::endl; << std::endl;
std::cout << "--reduceOp or -O, enum value indicating the reduction operations" std::cout << "--reduceOp or -O, enum value indicating the reduction operations"
<< std::endl; << std::endl;
...@@ -201,7 +201,7 @@ class AppArgs ...@@ -201,7 +201,7 @@ class AppArgs
if(!optarg) if(!optarg)
throw std::runtime_error("Invalid option format!"); throw std::runtime_error("Invalid option format!");
toReduceDims = getTypeValuesFromString<int>(optarg); reduceDims = getTypeValuesFromString<int>(optarg);
break; break;
case 'O': case 'O':
if(!optarg) if(!optarg)
...@@ -321,7 +321,7 @@ int profile_reduce(int argc, char* argv[]) ...@@ -321,7 +321,7 @@ int profile_reduce(int argc, char* argv[])
int rank = args.inLengths.size(); int rank = args.inLengths.size();
check_reduce_dims(rank, args.toReduceDims); check_reduce_dims(rank, args.reduceDims);
if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1) if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1)
throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!"); throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!");
...@@ -345,7 +345,7 @@ int profile_reduce(int argc, char* argv[]) ...@@ -345,7 +345,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.nrepeat,
args.inLengths, args.inLengths,
args.toReduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, args.nanOpt,
args.indicesOpt, args.indicesOpt,
...@@ -360,7 +360,7 @@ int profile_reduce(int argc, char* argv[]) ...@@ -360,7 +360,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.nrepeat,
args.inLengths, args.inLengths,
args.toReduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, args.nanOpt,
args.indicesOpt, args.indicesOpt,
...@@ -378,7 +378,7 @@ int profile_reduce(int argc, char* argv[]) ...@@ -378,7 +378,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.nrepeat,
args.inLengths, args.inLengths,
args.toReduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, args.nanOpt,
args.indicesOpt, args.indicesOpt,
...@@ -395,7 +395,7 @@ int profile_reduce(int argc, char* argv[]) ...@@ -395,7 +395,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.nrepeat,
args.inLengths, args.inLengths,
args.toReduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, args.nanOpt,
args.indicesOpt, args.indicesOpt,
...@@ -410,7 +410,7 @@ int profile_reduce(int argc, char* argv[]) ...@@ -410,7 +410,7 @@ int profile_reduce(int argc, char* argv[])
args.do_dumpout, args.do_dumpout,
args.nrepeat, args.nrepeat,
args.inLengths, args.inLengths,
args.toReduceDims, args.reduceDims,
args.reduceOp, args.reduceOp,
args.nanOpt, args.nanOpt,
args.indicesOpt, args.indicesOpt,
......
#!/bin/bash #!/bin/bash
PRECISION= ##--half PRECISION=
##PRECISION=--half
##PRECISION=--double
if test -n $PRECISION && test "$PRECISION" = "--half"; then if test -n $PRECISION && test "$PRECISION" = "--half"; then
CTYPE="-C 1" ACCTYPE="-C 1"
else else
CTYPE="" ACCTYPE=""
fi fi
WTYPE= driver="./bin/ckProfiler"
VERIFY="-v $1"
INIT=$2
NREPEAT=$3
if [ $# -ge 1 ] ; then
NREPEAT=$1
else
NREPEAT=1
fi
Operation=7 #### 0 - ADD, 5 - AVG, 7 - NORM2
Operations="0 5 7"
## for generic validation ## for generic validation
for op in $Operation; do for op in $Operations; do
set -x set -x
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT ####### datatype layout reduce dims op acctype verify init repeats
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
set +x set +x
done done
Operation=5 #### 0 - ADD, 5 - AVG, 7 - NORM2
Operations=5
## for performance evaluation (resnet50 NHWC => C) ## for performance evaluation (resnet50 NHWC => C)
for op in $Operation; do for op in $Operations; do
set -x set -x
./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT ####### datatype layout reduce dims op acctype verify init repeats
./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op $CTYPE $WTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT
set +x set +x
done done
#!/bin/bash #!/bin/bash
PRECISION= ##--half PRECISION=
##PRECISION=--half
##PRECISION=--double
if [ $# -ge 1 ] ; then driver="./bin/ckProfiler"
NREPEAT=$1
else
NREPEAT=1
fi
Operation=4 VERIFY="-v $1"
INIT=$2
NREPEAT=$3
LENGTHS=64,4,280,82 #### 2 - MIN, 3 - MAX, 4 - AMAX
Operations="2 4"
## for generic validation ## for generic validation
for op in $Operation; do for op in $Operations; do
for use_idx in 0 1; do for use_idx in 0 1; do
set -x set -x
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT ####### datatype layout reduce dims op use index verify init repeats
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 280,4,64,82 -R 0 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 4,64,280,82 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 64,280,82,4 -R 0,1,2 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,8192 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,1024 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 700,4 -R 1 -O $op $CTYPE -v 1 1 $NREPEAT $driver reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 256,22960 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 4,1469440 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
set +x set +x
done done
done done
Operations=2
## for performance evaluation (resnet50 NHWC => C) ## for performance evaluation (resnet50 NHWC => C)
for op in $Operation; do for op in $Operations; do
for use_idx in 0 1; do for use_idx in 0 1; do
set -x set -x
./bin/ckProfiler reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT ####### datatype layout reduce dims op use index verify init repeats
./bin/ckProfiler reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
./bin/ckProfiler reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx -v 1 1 $NREPEAT $driver reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
$driver reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT
set +x set +x
done done
done done
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment