Code clean up (#20)

* tuning parameters
* testing on V100
* add fp16
* remove deprecated tensor descriptor
* sync with MIOpen
* update build script

Co-authored-by: Jing Zhang <jizhan@amd.com>

Code clean up (#20)
* tuning parameters
* testing on V100
* add fp16
* remove deprecated tensor descriptor
* sync with MIOpen
* update build script

Co-authored-by: Jing Zhang <jizhan@amd.com>
5c7cec11 · Chao Liu · GitHub · 7d09790a · 5c7cec11 · 5c7cec11
Unverified Commit 5c7cec11 authored Jun 23, 2020 by Chao Liu Committed by GitHub Jun 23, 2020
14 changed files
--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -3,38 +3,28 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
+#include <half.hpp>
 #include "config.hpp"
-#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "print_array.hpp"
 #include "print_sequence.hpp"
 #include "device.hpp"
-#include "tensor_generator.hpp"
+#include "host_tensor_generator.hpp"
 #include "conv_common.hpp"
 #include "host_conv.hpp"
 #include "device_tensor.hpp"
-//#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-//#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp"
-//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

 int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 1
-    // 1x1
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 64;
-    constexpr index_t HI = 56;
-    constexpr index_t WI = 56;
+#if 0
+    // 1x1, 17x17
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 1024;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
    constexpr index_t K  = 256;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;
@@ -45,12 +35,87 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x7
+    // 1x1, 8x8
    constexpr index_t N  = 128;
-    constexpr index_t C  = 1024;
+    constexpr index_t C  = 1536;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1, 73x73
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 160;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K  = 64;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 3x3, 35x35
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 96;
+    constexpr index_t HI = 35;
+    constexpr index_t WI = 35;
+    constexpr index_t K  = 96;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 0
+    // 3x3, 71x71
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 192;
+    constexpr index_t HI = 71;
+    constexpr index_t WI = 71;
+    constexpr index_t K  = 192;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 0
+    // 7x1, 17x17
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
-    constexpr index_t K  = 1024;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 7;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<3, 0>;
+    using RightPads = Sequence<3, 0>;
+#elif 1
+    // 1x7, 17x17
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 7;

@@ -60,59 +125,74 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 3>;
    using RightPads = Sequence<0, 3>;
 #elif 0
-    // 3x3, 34x34
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 256;
-    constexpr index_t HI = 34;
-    constexpr index_t WI = 34;
-    constexpr index_t K  = 256;
+    // 3x3, 299x299 stride=2
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 3;
+    constexpr index_t HI = 299;
+    constexpr index_t WI = 299;
+    constexpr index_t K  = 32;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
+    // 3x3, 147x147
+    // v4r4@v100 xx.xx%, cudnn@v100 xx.xx%
    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
-    constexpr index_t HI = 35;
-    constexpr index_t WI = 35;
-    constexpr index_t K  = 128;
+    constexpr index_t C  = 32;
+    constexpr index_t HI = 147;
+    constexpr index_t WI = 147;
+    constexpr index_t K  = 64;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

-    using ConvStrides   = Sequence<2, 2>;
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 0
+    // 3x3, 149x149
+    // v4r4@v100 xx.xx%, cudnn@v100 xx.xx%
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 32;
+    constexpr index_t HI = 149;
+    constexpr index_t WI = 149;
+    constexpr index_t K  = 32;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 1536;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 256;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    // 3x3, 17x17, stride 2
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 192;
+    constexpr index_t HI = 17;
+    constexpr index_t WI = 17;
+    constexpr index_t K  = 192;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
+    // 1x1, 35x35
    constexpr index_t N  = 128;
-    constexpr index_t C  = 2048;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 384;
+    constexpr index_t C  = 384;
+    constexpr index_t HI = 35;
+    constexpr index_t WI = 35;
+    constexpr index_t K  = 96;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

@@ -121,110 +201,134 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 0
-    // 1x1 filter, 7x7 image
-    // cudnn@V100 82%, ck@V100 76%, ck@P100 67%, ck@VII 64%
+#elif 1
+    // 3x3, 35x35, stride 2
    constexpr index_t N  = 128;
-    constexpr index_t C  = 832;
-    constexpr index_t HI = 7;
-    constexpr index_t WI = 7;
+    constexpr index_t C  = 288;
+    constexpr index_t HI = 35;
+    constexpr index_t WI = 35;
    constexpr index_t K  = 384;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 83%, ck@V100 75%, ck@P100 78%, ck@VII 65%
+    // 1x3, 8x8
    constexpr index_t N  = 128;
-    constexpr index_t C  = 1280;
+    constexpr index_t C  = 384;
    constexpr index_t HI = 8;
    constexpr index_t WI = 8;
-    constexpr index_t K  = 384;
+    constexpr index_t K  = 448;
    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
+    using LeftPads  = Sequence<0, 1>;
+    using RightPads = Sequence<0, 1>;
 #elif 0
-    // 1x1 filter, 14x14 image
-    // cudnn@V100 62%, ck@V100 68%, ck@P100 70%, ck@VII 50%
+    // 3x1, 8x8
    constexpr index_t N  = 128;
-    constexpr index_t C  = 512;
-    constexpr index_t HI = 14;
-    constexpr index_t WI = 14;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
+    constexpr index_t C  = 448;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K  = 512;
+    constexpr index_t Y  = 3;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
+    using LeftPads  = Sequence<1, 0>;
+    using RightPads = Sequence<1, 0>;
 #elif 0
-    // 1x1 filter, 8x8 image
-    // cudnn@V100 74%, ck@V100 57%, ck@P100 78%, ck@VII 61%
-    constexpr index_t N  = 64;
-    constexpr index_t C  = 1536;
+    // 3x1, 8x8
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 448;
    constexpr index_t HI = 8;
    constexpr index_t WI = 8;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 1;
+    constexpr index_t K  = 512;
+    constexpr index_t Y  = 3;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

+    using LeftPads  = Sequence<1, 0>;
+    using RightPads = Sequence<1, 0>;
+#elif 0
+    // 3x3, 147x147
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 147;
+    constexpr index_t WI = 147;
+    constexpr index_t K  = 96;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 28x28 image
-    // cudnn@V100 86%, ck@V100 84%, ck@P100 80%, ck@VII 69%
+    // 7x1, 73x73
+    // v44@v100 xx.xx%, cudnn@v100 xx.xx%
    constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
-    constexpr index_t HI = 28;
-    constexpr index_t WI = 28;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K  = 64;
+    constexpr index_t Y  = 7;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

+    using LeftPads  = Sequence<3, 0>;
+    using RightPads = Sequence<3, 0>;
+#elif 0
+    // 3x3, 73x73
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 73;
+    constexpr index_t WI = 73;
+    constexpr index_t K  = 96;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 7x7 image
-    // cudnn@V100 71%, ck@V100 55%, ck@P100 70%, ck@VII 62%
+    // 1x1, 14x14, stride 2
    constexpr index_t N  = 128;
-    constexpr index_t C  = 832;
-    constexpr index_t HI = 7;
-    constexpr index_t WI = 7;
-    constexpr index_t K  = 256;
+    constexpr index_t C  = 1024;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 2048;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 17x17 input
-    // cudnn@V100 81%, ck@V100 76%, ck@P100 70%, ck@VII 76%
+    // 1x1, 14x14
    constexpr index_t N  = 128;
-    constexpr index_t C  = 768;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
+    constexpr index_t C  = 1024;
+    constexpr index_t HI = 14;
+    constexpr index_t WI = 14;
+    constexpr index_t K  = 256;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

@@ -234,63 +338,104 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 14x14 image
-    // cudnn@V100 73%, ck@V100 71%, ck@P100 70%, ck@VII 64%
+    // 1x1, 14x14, stride 2
    constexpr index_t N  = 128;
-    constexpr index_t C  = 528;
+    constexpr index_t C  = 1024;
    constexpr index_t HI = 14;
    constexpr index_t WI = 14;
-    constexpr index_t K  = 128;
+    constexpr index_t K  = 512;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 14x14 image
-    // cudnn@V100 73%, ck@V100 72%, ck@P100 79%, ck@VII 75%
+    // 3x3, 28x28
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 128;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 128;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 1
+    // 3x3, 14x14
    constexpr index_t N  = 128;
-    constexpr index_t C  = 528;
+    constexpr index_t C  = 256;
    constexpr index_t HI = 14;
    constexpr index_t WI = 14;
    constexpr index_t K  = 256;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
+#elif 1
+    // 1x1, 56x56, stride 2
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 56;
+    constexpr index_t WI = 56;
+    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x1 filter, 7x7 image
-    // cudnn@V100 49%, ck@V100 50%, ck@P100 61%, ck@VII 52%
+    // 7x7, 230x230 stride=2
    constexpr index_t N  = 128;
-    constexpr index_t C  = 832;
-    constexpr index_t HI = 7;
-    constexpr index_t WI = 7;
-    constexpr index_t K  = 128;
+    constexpr index_t C  = 3;
+    constexpr index_t HI = 230;
+    constexpr index_t WI = 230;
+    constexpr index_t K  = 64;
+    constexpr index_t Y  = 7;
+    constexpr index_t X  = 7;
+
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
+    // 1x1, 28x28, stride = 2
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 512;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 1024;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

-    using ConvStrides   = Sequence<1, 1>;
+    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
-    // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
+    // 1x1, 28x28, stride 2
    constexpr index_t N  = 128;
-    constexpr index_t C  = 288;
-    constexpr index_t HI = 35;
-    constexpr index_t WI = 35;
-    constexpr index_t K  = 384;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t C  = 512;
+    constexpr index_t HI = 28;
+    constexpr index_t WI = 28;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<2, 2>;
    using ConvDilations = Sequence<1, 1>;
@@ -298,71 +443,92 @@ int main(int argc, char* argv[])
    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 5x5 filter, 2x2 pad, 7x7 input
+    // 1x1, 7x7
    constexpr index_t N  = 128;
-    constexpr index_t C  = 48;
+    constexpr index_t C  = 512;
    constexpr index_t HI = 7;
    constexpr index_t WI = 7;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 5;
-    constexpr index_t X  = 5;
+    constexpr index_t K  = 2048;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<2, 2>;
-    using RightPads = Sequence<2, 2>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
-    // 1x7 filter, 0x3 pad, 17x17 input
+    // 3x3, 7x7
    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 7;
+    constexpr index_t C  = 512;
+    constexpr index_t HI = 7;
+    constexpr index_t WI = 7;
+    constexpr index_t K  = 512;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<0, 3>;
-    using RightPads = Sequence<0, 3>;
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #elif 1
-    // 7x1 filter, 3x0 pad, 17x17 input
+    // 1x1, 56x56
    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 7;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 56;
+    constexpr index_t WI = 56;
+    constexpr index_t K  = 64;
+    constexpr index_t Y  = 1;
    constexpr index_t X  = 1;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<3, 0>;
-    using RightPads = Sequence<3, 0>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 1
+    // 3x3, 56x56
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 64;
+    constexpr index_t HI = 56;
+    constexpr index_t WI = 56;
+    constexpr index_t K  = 64;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;
+
+    using LeftPads  = Sequence<1, 1>;
+    using RightPads = Sequence<1, 1>;
 #endif

-    auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
-    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor_deprecated(
+    auto in_nchw_desc  = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
+    auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
-    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
+    ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
+    ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
+    ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
    print_sequence("LeftPads", LeftPads{});
    print_sequence("RightPads", RightPads{});
    print_sequence("ConvStrides", ConvStrides{});
    print_sequence("ConvDilations", ConvDilations{});

+#if 1
    using in_data_t  = float;
    using out_data_t = float;
-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
+#else
+    using in_data_t  = half_float::half;
+    using out_data_t = half_float::half;
+#endif
+
+    Tensor<in_data_t> in_nchw(make_HostTensorDescriptor(in_nchw_desc));
+    Tensor<in_data_t> wei_kcyx(make_HostTensorDescriptor(wei_kcyx_desc));
+    Tensor<out_data_t> out_nkhw_host(make_HostTensorDescriptor(out_nkhw_desc));
+    Tensor<out_data_t> out_nkhw_device(make_HostTensorDescriptor(out_nkhw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

@@ -399,42 +565,7 @@ int main(int argc, char* argv[])
 #endif
    }

-#if 0
-    device_convolution_direct_v2_nchw_kcyx_nkhw
-        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(in_nchw_desc,
-                                                              in_nchw,
-                                                              wei_kcyx_desc,
-                                                              wei_kcyx,
-                                                              out_nkhw_desc,
-                                                              out_nkhw_device,
-                                                              LeftPads{},
-                                                              RightPads{},
-                                                              nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(
-        in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
-        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
-                                                                    in_nchw,
-                                                                    wei_kcyx_desc,
-                                                                    wei_kcyx,
-                                                                    out_nkhw_desc,
-                                                                    out_nkhw_device,
-                                                                    ConvStrides{},
-                                                                    ConvDilations{},
-                                                                    nrepeat);
-#elif 0
+#if 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
@@ -446,36 +577,6 @@ int main(int argc, char* argv[])
                                                         LeftPads{},
                                                         RightPads{},
                                                         nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
-                                                         wei_kcyx_desc,
-                                                         wei_kcyx,
-                                                         out_nkhw_desc,
-                                                         out_nkhw_device,
-                                                         ConvStrides{},
-                                                         ConvDilations{},
-                                                         nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc,
-                                                         in_nchw,
-                                                         wei_kcyx_desc,
-                                                         wei_kcyx,
-                                                         out_nkhw_desc,
-                                                         out_nkhw_device,
-                                                         ConvStrides{},
-                                                         ConvDilations{},
-                                                         nrepeat);
-#elif 0
-    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
-                                                                    in_nchw,
-                                                                    wei_kcyx_desc,
-                                                                    wei_kcyx,
-                                                                    out_nkhw_desc,
-                                                                    out_nkhw_device,
-                                                                    ConvStrides{},
-                                                                    ConvDilations{},
-                                                                    nrepeat);
 #elif 1
    device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
@@ -492,7 +593,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-#if 1
+#if 0
        if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 &&
           ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1)
        {

--- a/driver/src/device.cpp
+++ b/driver/src/device.cpp
@@ -6,7 +6,7 @@ DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
 #if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
 #elif CK_DEVICE_BACKEND_NVIDIA
-    checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
+    cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize);
 #endif
 }

@@ -18,8 +18,7 @@ void DeviceMem::ToDevice(const void* p)
    hipGetErrorString(
        hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
 #elif CK_DEVICE_BACKEND_NVIDIA
-    checkCudaErrors(
-        cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
+    cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice);
 #endif
 }

@@ -28,7 +27,7 @@ void DeviceMem::FromDevice(void* p)
 #if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
 #elif CK_DEVICE_BACKEND_NVIDIA
-    checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
+    cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost);
 #endif
 }

@@ -37,7 +36,7 @@ DeviceMem::~DeviceMem()
 #if CK_DEVICE_BACKEND_AMD
    hipGetErrorString(hipFree(mpDeviceBuf));
 #elif CK_DEVICE_BACKEND_NVIDIA
-    checkCudaErrors(cudaFree(mpDeviceBuf));
+    cudaFree(mpDeviceBuf);
 #endif
 }

@@ -68,8 +67,10 @@ struct KernelTimerImpl
    void Start()
    {
 #if CK_DEVICE_BACKEND_AMD
+        hipDeviceSynchronize();
        hipEventRecord(mStart, 0);
 #elif CK_DEVICE_BACKEND_NVIDIA
+        cudaDeviceSynchronize();
        cudaEventRecord(mStart, 0);
 #endif
    }

--- a/driver/src/tensor.cpp
+++ b/driver/src/tensor.cpp
 #include <boost/range/adaptor/transformed.hpp>
 #include <cassert>

-#include "tensor.hpp"
+#include "host_tensor.hpp"

 template <typename X>
-TensorDescriptor::TensorDescriptor(std::vector<X> lens) : mLens(lens)
+HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
 {
    this->CalculateStrides();
 }

 template <typename X, typename Y>
-TensorDescriptor::TensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
+HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
    : mLens(lens), mStrides(strides)
 {
 }

-void TensorDescriptor::CalculateStrides()
+void HostTensorDescriptor::CalculateStrides()
 {
    mStrides.clear();
    mStrides.resize(mLens.size(), 0);
@@ -27,21 +27,21 @@ void TensorDescriptor::CalculateStrides()
        mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }
+std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); }

-std::size_t TensorDescriptor::GetElementSize() const
+std::size_t HostTensorDescriptor::GetElementSize() const
 {
    assert(mLens.size() == mStrides.size());
    return std::accumulate(
        mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetElementSpace() const
+std::size_t HostTensorDescriptor::GetElementSpace() const
 {
    auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; });
    return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1;
 }

-const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }
+const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { return mLens; }

-const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }
+const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; }
--- a/external/half/include/half.hpp
+++ b/external/half/include/half.hpp
--- a/external/include/bfloat16_dev.hpp
+++ b/external/include/bfloat16_dev.hpp
--- a/script/cmake-cuda.sh
+++ b/script/cmake-cuda.sh
 #!/bin/bash

-rm -f CMakeCache.txt
-rm -f *.cmake
-rm -rf CMakeFiles
-
-MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
-MY_PROJECT_INSTALL=../install.dir
+MY_PROJECT_SOURCE=../../../
+ 
+ export CUDA_ROOT=/usr/local/cuda
+ export CPATH=$CPATH:$CUDA_ROOT/include
+ export LIBRARY_PATH=$LIBRARY_PATH:$CUDA_ROOT/lib64
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_ROOT/lib64

 cmake                                                                                       \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
 -D CMAKE_CXX_COMPILER=clang++                                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
 -D DEVICE_BACKEND=NVIDIA                                                                    \
-D CUDA_COMMON_INCLUDE_DIR="/package/install/cuda/10.1/NVIDIA_CUDA-10.1_Samples/common/inc" \
 -D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -maxrregcount=128" \
 ${MY_PROJECT_SOURCE}

-#-D BOOST_ROOT="/package/install/boost_1.67.0"                                               \

-#-D CMAKE_CUDA_COMPILER="/package/install/cuda_10.0/bin/nvcc"                                \
-#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
-#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -maxrregcount=128" \
+#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_70,code=sm_70" \
+#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_70,code=sm_70 -Xptxas -v -maxrregcount=128" \
--- a/script/cmake-cuda_docker.sh
+++ b/script/cmake-cuda_docker.sh
-#!/bin/bash
-
-MY_PROJECT_SOURCE=../../../
-MY_PROJECT_INSTALL=../install.dir
-
-export CUDA_ROOT=/usr/local/cuda
-export CPATH=$CPATH:$CUDA_ROOT/include
-export LIBRARY_PATH=$LIBRARY_PATH:$CUDA_ROOT/lib64
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_ROOT/lib64
-
-cmake                                                                                       \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
-D CMAKE_CXX_COMPILER=clang++-6.0                                                           \
-D CMAKE_BUILD_TYPE=Release                                                                 \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
-D DEVICE_BACKEND=NVIDIA                                                                    \
-D CUDA_COMMON_INCLUDE_DIR="/root/NVIDIA_CUDA-10.1_Samples/common/inc"                      \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -maxrregcount=128" \
-${MY_PROJECT_SOURCE}
-
-
-#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61" \
-#-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -maxrregcount=128" \
--- a/script/cmake-hip.sh
+++ b/script/cmake-hip.sh
 #!/bin/bash
-
 rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles
@@ -11,9 +10,10 @@ cmake
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                 \
 -D DEVICE_BACKEND="AMD"                                                                     \
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v"                               \
-D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906"                              \
+-D CMAKE_CXX_FLAGS="--amdgpu-target=gfx906"                                                 \
 -D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc                                               \
 -D CMAKE_PREFIX_PATH="/opt/rocm"                                                            \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                           \
 ${MY_PROJECT_SOURCE}
+
+#-D CMAKE_CXX_FLAGS="-gline-tables-only -v --amdgpu-target=gfx906"                           \
--- a/script/cmake-rocm3.5.sh
+++ b/script/cmake-rocm3.5.sh
+#!/bin/bash
+rm -f CMakeCache.txt
+rm -f *.cmake
+rm -rf CMakeFiles
+
+MY_PROJECT_SOURCE=../../../
+MY_PROJECT_INSTALL=../install.dir
+
+cmake                                                                                                                              \
+-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                      \
+-D CMAKE_BUILD_TYPE=Release                                                                                                        \
+-D DEVICE_BACKEND="AMD"                                                                                                            \
+-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0"       \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                          \
+-D CMAKE_PREFIX_PATH="/opt/rocm"                                                                                                   \
+-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                  \
+${MY_PROJECT_SOURCE}
+
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0"       \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -save-temps"       \
+#-D CMAKE_CXX_FLAGS="-O3 --amdgpu-target=gfx906 -mllvm --amdgpu-enable-global-sgpr-addr -mllvm --amdgpu-spill-vgpr-to-agpr=0 -v -gline-tables-only -save-temps"       \
--- a/script/compile-hip.sh
+++ b/script/compile-hip.sh
--- a/script/docker-cuda.sh
+++ b/script/docker-cuda.sh
-WORKSPACE=$1
-echo "workspace: " $WORKSPACE
-sudo docker run  -it  -v $WORKSPACE:/root/workspace --group-add sudo --runtime=nvidia     asroy/cuda:10.1-cudnn7-devel-ubuntu18.04-latest /bin/bash
--- a/script/hack_isa.sh
+++ b/script/hack_isa.sh
-# step 1: GET ISA DUMP
-#cd /root/workspace/mlopen/modular_convolution/build/hipcc/build.dir/driver && KMDUMPISA=1 /opt/rocm/hip/bin/hipcc    -I/root/workspace/mlopen/modular_convolution/build/hipcc/build.dir/composable_kernel/include/utility -I/root/workspace/mlopen/modular_convolution/driver/include -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/kernel_algorithm -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/tensor_operation -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/tensor_description -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/utility -I/root/workspace/mlopen/modular_convolution/composable_kernel/include  -gline-tables-only --amdgpu-target=gfx906 -fopenmp=libomp -O3 -DNDEBUG   -std=c++14 -o CMakeFiles/driver.dir/src/driver.cpp.o -c /root/workspace/mlopen/modular_convolution/driver/src/driver.cpp -fno-gpu-rdc
-
-# step 2: HACK ISA
-#cd /root/workspace/mlopen/modular_convolution/build/hipcc/build.dir/driver && KMHACKISA=1 /opt/rocm/hip/bin/hipcc    -I/root/workspace/mlopen/modular_convolution/build/hipcc/build.dir/composable_kernel/include/utility -I/root/workspace/mlopen/modular_convolution/driver/include -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/kernel_algorithm -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/tensor_operation -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/tensor_description -I/root/workspace/mlopen/modular_convolution/composable_kernel/include/utility -I/root/workspace/mlopen/modular_convolution/composable_kernel/include  -gline-tables-only --amdgpu-target=gfx906 -fopenmp=libomp -O3 -DNDEBUG   -std=c++14 -o CMakeFiles/driver.dir/src/driver.cpp.o -c /root/workspace/mlopen/modular_convolution/driver/src/driver.cpp -fno-gpu-rdc
-
-# step 3: LINK
-#/opt/rocm/hip/bin/hipcc   -gline-tables-only --amdgpu-target=gfx906 -fopenmp=libomp -O3 -DNDEBUG   CMakeFiles/driver.dir/src/driver.cpp.o  -o driver -rdynamic libhost.so -Wl,-rpath,/root/workspace/mlopen/modular_convolution/build/hipcc/build.dir/driver 
-
--- a/script/trace.sh
+++ b/script/trace.sh
-#!/bin/bash
-
-/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10
--- a/script/tracer-hip.sh
+++ b/script/tracer-hip.sh
-#!/bin/bash
-
-/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10